diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..cdced8f1b1 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,8 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + cooldown: + default-days: 7 \ No newline at end of file diff --git a/.github/workflows/mac_mpich.yml b/.github/workflows/mac_mpich.yml deleted file mode 100644 index 72ca9c9037..0000000000 --- a/.github/workflows/mac_mpich.yml +++ /dev/null @@ -1,206 +0,0 @@ -name: Mac OSX with MPICH - -on: - push: - branches: [ master, test_github_actions ] - paths-ignore: - - '**/*.md' - - '**/*.txt' - - '**/*.1' - - '**/*.jpg' - - '**/*.png' - - 'docs/*' - - 'test/test_installed/*' - pull_request: - branches: [ master, test_github_actions ] - paths-ignore: - - '**/*.md' - - '**/*.txt' - - '**/*.1' - - '**/*.jpg' - - '**/*.png' - - 'docs/*' - - 'test/test_installed/*' - -env: - MPICH_VERSION: 4.3.0 - AUTOCONF_VERSION: 2.71 - AUTOMAKE_VERSION: 1.17 - LIBTOOL_VERSION: 2.5.4 - M4_VERSION: 1.4.19 - -jobs: - build: - runs-on: macos-latest - timeout-minutes: 60 - steps: - - uses: actions/checkout@v4 - - name: Set up dependencies - run: | - # brew install gcc - # which gcc - # gcc --version - # which gfortran - - name: Clean up git untracked files - run: | - git clean -fx - - name: Build GNU autotools - run: | - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/m4/m4-${M4_VERSION}.tar.gz - gzip -dc m4-${M4_VERSION}.tar.gz | tar -xf - - cd m4-${M4_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/autoconf/autoconf-${AUTOCONF_VERSION}.tar.gz - gzip -dc autoconf-${AUTOCONF_VERSION}.tar.gz | tar -xf - - cd 
autoconf-${AUTOCONF_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/automake/automake-${AUTOMAKE_VERSION}.tar.gz - gzip -dc automake-${AUTOMAKE_VERSION}.tar.gz | tar -xf - - cd automake-${AUTOMAKE_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/libtool/libtool-${LIBTOOL_VERSION}.tar.gz - gzip -dc libtool-${LIBTOOL_VERSION}.tar.gz | tar -xf - - cd libtool-${LIBTOOL_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - - name: Build MPICH - run: | - cd ${GITHUB_WORKSPACE} - rm -rf MPICH ; mkdir MPICH ; cd MPICH - wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz - gzip -dc mpich-${MPICH_VERSION}.tar.gz | tar -xf - - cd mpich-${MPICH_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/MPICH \ - --silent \ - --enable-romio \ - --with-file-system=ufs \ - --with-device=ch3:sock \ - --disable-fortran \ - CC=gcc - make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - - name: Build PnetCDF - run: | - cd ${GITHUB_WORKSPACE} - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - m4 --version - autoconf --version - automake --version - libtool --version - autoreconf -i - mkdir -p pnetcdf_output - ./configure --enable-option-checking=fatal \ - --enable-profiling \ - pnc_ac_debug=yes \ - --enable-burst_buffering \ - --enable-subfiling \ - --enable-thread-safe \ - --with-pthread \ - --disable-fortran \ - --with-mpi=${GITHUB_WORKSPACE}/MPICH \ - TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output - make -j 8 
tests - - name: Print config.log - if: ${{ always() }} - run: | - cat ${GITHUB_WORKSPACE}/config.log - - name: make check - run: | - cd ${GITHUB_WORKSPACE} - make check - - name: Print test log files - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - fname=`find src test examples benchmarks -type f -name "*.log"` - for f in $fname ; do \ - bname=`basename $f` ; \ - if test "x$bname" != xconfig.log ; then \ - echo "-------- dump $f ----------------------------" ; \ - cat $f ; \ - fi ; \ - done - - name: make ptests - run: | - cd ${GITHUB_WORKSPACE} - make ptests - - name: Build PnetCDF (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - make distclean - rm -rf pnetcdf_output - mkdir -p pnetcdf_output - ./configure --disable-fortran \ - --with-mpi=${GITHUB_WORKSPACE}/MPICH \ - TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output - make -j 8 tests - - name: Print config.log (default configuration) - if: ${{ always() }} - run: | - cat ${GITHUB_WORKSPACE}/config.log - - name: make check (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - make check - - name: Print test log files (default configuration) - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - fname=`find src test examples benchmarks -type f -name "*.log"` - for f in $fname ; do \ - bname=`basename $f` ; \ - if test "x$bname" != xconfig.log ; then \ - echo "-------- dump $f ----------------------------" ; \ - cat $f ; \ - fi ; \ - done - - name: make ptests (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - make ptests - - name: make distcheck - run: | - cd ${GITHUB_WORKSPACE} - make -j 8 distcheck DISTCHECK_CONFIGURE_FLAGS="--silent --with-mpi=${GITHUB_WORKSPACE}/MPICH" - - name: make install - run: | - cd ${GITHUB_WORKSPACE} - prefix_path=${GITHUB_WORKSPACE}/pnetcdf_install - echo "---- test make install prefix=${prefix_path}" - make install 
prefix=${prefix_path} - test/tst_install.sh ${prefix_path} - prefix_path="/pnetcdf_install" - destdir_path=${GITHUB_WORKSPACE}/inst - echo "---- test make install prefix=${prefix_path} DESTDIR=${destdir_path}" - make install prefix=${prefix_path} DESTDIR=${destdir_path} - test/tst_install.sh ${prefix_path} ${destdir_path} - - name: Cleanup - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - make -s distclean - rm -rf ${GITHUB_WORKSPACE}/pnetcdf_output - rm -rf ${GITHUB_WORKSPACE}/MPICH - rm -rf ${GITHUB_WORKSPACE}/pnetcdf_install - rm -rf ${GITHUB_WORKSPACE}/inst - diff --git a/.github/workflows/mac_openmpi.yml b/.github/workflows/mac_openmpi.yml deleted file mode 100644 index 65fcb10be2..0000000000 --- a/.github/workflows/mac_openmpi.yml +++ /dev/null @@ -1,208 +0,0 @@ -name: Mac OSX with OpenMPI - -on: - push: - branches: [ master, test_github_actions ] - paths-ignore: - - '**/*.md' - - '**/*.txt' - - '**/*.1' - - '**/*.jpg' - - '**/*.png' - - 'docs/*' - - 'test/test_installed/*' - pull_request: - branches: [ master, test_github_actions ] - paths-ignore: - - '**/*.md' - - '**/*.txt' - - '**/*.1' - - '**/*.jpg' - - '**/*.png' - - 'docs/*' - - 'test/test_installed/*' - -env: - OPENMPI_VERSION: 5.0.2 - AUTOCONF_VERSION: 2.71 - AUTOMAKE_VERSION: 1.17 - LIBTOOL_VERSION: 2.5.4 - M4_VERSION: 1.4.19 - -jobs: - build: - runs-on: macos-latest - timeout-minutes: 90 - steps: - - uses: actions/checkout@v4 - - name: Set up dependencies - run: | - # brew install gcc - # which gcc - # gcc --version - # which gfortran - - name: Clean up git untracked files - run: | - git clean -fx - - name: Build GNU autotools - run: | - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/m4/m4-${M4_VERSION}.tar.gz - gzip -dc m4-${M4_VERSION}.tar.gz | tar -xf - - cd m4-${M4_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - 
make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/autoconf/autoconf-${AUTOCONF_VERSION}.tar.gz - gzip -dc autoconf-${AUTOCONF_VERSION}.tar.gz | tar -xf - - cd autoconf-${AUTOCONF_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/automake/automake-${AUTOMAKE_VERSION}.tar.gz - gzip -dc automake-${AUTOMAKE_VERSION}.tar.gz | tar -xf - - cd automake-${AUTOMAKE_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/libtool/libtool-${LIBTOOL_VERSION}.tar.gz - gzip -dc libtool-${LIBTOOL_VERSION}.tar.gz | tar -xf - - cd libtool-${LIBTOOL_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - - name: Build OPENMPI - run: | - cd ${GITHUB_WORKSPACE} - rm -rf OPENMPI ; mkdir OPENMPI ; cd OPENMPI - VER_MAJOR=${OPENMPI_VERSION%.*} - wget -q https://download.open-mpi.org/release/open-mpi/v${VER_MAJOR}/openmpi-${OPENMPI_VERSION}.tar.gz - gzip -dc openmpi-${OPENMPI_VERSION}.tar.gz | tar -xf - - cd openmpi-${OPENMPI_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/OPENMPI \ - --silent \ - --with-io-romio-flags="--with-file-system=ufs" \ - --with-hwloc=internal \ - --with-pmix=internal \ - --with-libevent=internal \ - --disable-mpi-fortran \ - CC=gcc - make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - - name: Build PnetCDF - run: | - cd ${GITHUB_WORKSPACE} - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - m4 --version - autoconf --version - automake --version - libtool 
--version - autoreconf -i - mkdir -p pnetcdf_output - ./configure --enable-option-checking=fatal \ - --enable-profiling \ - pnc_ac_debug=yes \ - --enable-burst_buffering \ - --enable-subfiling \ - --enable-thread-safe \ - --with-pthread \ - --disable-fortran \ - --with-mpi=${GITHUB_WORKSPACE}/OPENMPI \ - TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output - make -j 8 tests - - name: Print config.log - if: ${{ always() }} - run: | - cat ${GITHUB_WORKSPACE}/config.log - - name: make check - run: | - cd ${GITHUB_WORKSPACE} - make check - - name: Print test log files - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - fname=`find src test examples benchmarks -type f -name "*.log"` - for f in $fname ; do \ - bname=`basename $f` ; \ - if test "x$bname" != xconfig.log ; then \ - echo "-------- dump $f ----------------------------" ; \ - cat $f ; \ - fi ; \ - done - - name: make ptests - run: | - cd ${GITHUB_WORKSPACE} - make ptests - - name: Build PnetCDF (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - make distclean - rm -rf pnetcdf_output - mkdir -p pnetcdf_output - ./configure --disable-fortran \ - --with-mpi=${GITHUB_WORKSPACE}/OPENMPI \ - TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output - make -j 8 tests - - name: Print config.log (default configuration) - if: ${{ always() }} - run: | - cat ${GITHUB_WORKSPACE}/config.log - - name: make check (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - make check - - name: Print test log files (default configuration) - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - fname=`find src test examples benchmarks -type f -name "*.log"` - for f in $fname ; do \ - bname=`basename $f` ; \ - if test "x$bname" != xconfig.log ; then \ - echo "-------- dump $f ----------------------------" ; \ - cat $f ; \ - fi ; \ - done - - name: make ptests (default configuration) - run: | - cd 
${GITHUB_WORKSPACE} - make ptests - - name: make distcheck - run: | - cd ${GITHUB_WORKSPACE} - make -j 8 distcheck DISTCHECK_CONFIGURE_FLAGS="--silent --with-mpi=${GITHUB_WORKSPACE}/OPENMPI" - - name: make install - run: | - cd ${GITHUB_WORKSPACE} - prefix_path=${GITHUB_WORKSPACE}/pnetcdf_install - echo "---- test make install prefix=${prefix_path}" - make install prefix=${prefix_path} - test/tst_install.sh ${prefix_path} - prefix_path="/pnetcdf_install" - destdir_path=${GITHUB_WORKSPACE}/inst - echo "---- test make install prefix=${prefix_path} DESTDIR=${destdir_path}" - make install prefix=${prefix_path} DESTDIR=${destdir_path} - test/tst_install.sh ${prefix_path} ${destdir_path} - - name: Cleanup - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - make -s distclean - rm -rf ${GITHUB_WORKSPACE}/pnetcdf_output - rm -rf ${GITHUB_WORKSPACE}/OPENMPI - rm -rf ${GITHUB_WORKSPACE}/pnetcdf_install - rm -rf ${GITHUB_WORKSPACE}/inst - diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000000..bae4a86017 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,224 @@ +# Test Combinations of Ubuntu/MacOSX, MPICH/OpenMPI, and UFS/Lustre +name: CI - OS and MPI + +permissions: + contents: read + +on: + push: + branches: [ master ] + paths-ignore: + - '**/*.md' + - '**/*.txt' + - '**/*.1' + - '**/*.jpg' + - '**/*.png' + - 'docs/*' + - 'test/test_installed/*' + pull_request: + branches: [ master ] + paths-ignore: + - '**/*.md' + - '**/*.txt' + - '**/*.1' + - '**/*.jpg' + - '**/*.png' + - 'docs/*' + - 'test/test_installed/*' + +env: + LIBTOOL_VERSION: 2.5.4 + MPICH_LATEST: 4.3.2 + OPENMPI_LATEST: 5.0.9 + +jobs: + build: + strategy: + fail-fast: false # This disables the default cancel-on-failure behavior + matrix: + os: [ubuntu-latest, macos-latest] + mpi_vendor: [ MPICH, OpenMPI ] + fstype: [ ufs, mimic_lustre ] + + runs-on: ${{ matrix.os }} + timeout-minutes: 120 + + steps: + - uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v4 + + - name: Install autotools + run: | + set -x + if test "${{ matrix.os }}" = "ubuntu-latest" ; then + sudo apt-get update + sudo apt-get install -y autoconf # -y: the runner has no TTY to answer prompts + sudo apt-get install -y automake + sudo apt-get install -y m4 + + # sudo apt-get install libtool libtool-bin + wget -q https://ftp.gnu.org/gnu/libtool/libtool-${LIBTOOL_VERSION}.tar.gz + gzip -dc libtool-${LIBTOOL_VERSION}.tar.gz | tar -xf - + cd libtool-${LIBTOOL_VERSION} + ./configure --prefix=/usr --silent + sudo make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 + sudo make -s LIBTOOLFLAGS=--silent V=1 -j 8 distclean >> qout 2>&1 + else + brew update + brew install coreutils + brew install autoconf + brew install automake + brew install libtool + export PATH="/opt/homebrew/opt/libtool/libexec/gnubin:${PATH}" + brew list m4 || brew install m4 + fi + which autoconf + autoconf --version + which automake + automake --version + which libtool + libtool --version + which m4 + m4 --version + + - name: Install MPI compiler vendor - ${{ matrix.mpi_vendor }} + run: | + set -x + if test "${{ matrix.os }}" = "macos-latest" ; then + # Must reinstall gcc to get gfortran installed + brew reinstall gcc + which gcc + gcc --version + which gfortran + gfortran --version + fi + if test "${{ matrix.mpi_vendor }}" = "MPICH" ; then + # MPICH versions older than 4.2.2 do not support the MPI large + # count feature. 
+ echo "Install MPICH ${MPICH_LATEST} in ${GITHUB_WORKSPACE}/MPI" + wget -q https://www.mpich.org/static/downloads/${MPICH_LATEST}/mpich-${MPICH_LATEST}.tar.gz + gzip -dc mpich-${MPICH_LATEST}.tar.gz | tar -xf - + cd mpich-${MPICH_LATEST} + ./configure --prefix=${GITHUB_WORKSPACE}/MPI \ + --silent \ + --enable-romio \ + --with-file-system=ufs \ + --with-device=ch3:sock \ + --enable-fortran \ + CC=gcc FC=gfortran \ + FFLAGS=-fallow-argument-mismatch \ + FCFLAGS=-fallow-argument-mismatch + make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 + make -s LIBTOOLFLAGS=--silent V=1 -j 8 distclean >> qout 2>&1 + else # OpenMPI + echo "Install OPENMPI ${OPENMPI_LATEST} in ${GITHUB_WORKSPACE}/MPI" + VER_MAJOR=${OPENMPI_LATEST%.*} + wget -q https://download.open-mpi.org/release/open-mpi/v${VER_MAJOR}/openmpi-${OPENMPI_LATEST}.tar.gz + gzip -dc openmpi-${OPENMPI_LATEST}.tar.gz | tar -xf - + cd openmpi-${OPENMPI_LATEST} + if test "${{ matrix.os }}" = "macos-latest" ; then + # When built on MacOS, additional configure options are required. 
+ EXTRA_OPTS="--with-hwloc=internal --with-pmix=internal --with-libevent=internal" + fi + ./configure --prefix=${GITHUB_WORKSPACE}/MPI \ + --silent \ + --with-io-romio-flags="--with-file-system=ufs" \ + ${EXTRA_OPTS} \ + CC=gcc \ + FC=gfortran \ + FCFLAGS=-fallow-argument-mismatch + make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 + make -s LIBTOOLFLAGS=--silent V=1 -j 8 distclean >> qout 2>&1 + fi + + - name: Clean up git untracked files + run: | + git clean -fx + + - name: Build PnetCDF - ${{ matrix.fstype }} + run: | + set -x + cd ${GITHUB_WORKSPACE} + if test "${{ matrix.os }}" = "macos-latest" ; then + export PATH="/opt/homebrew/opt/libtool/libexec/gnubin:${PATH}" + fi + which autoconf + autoconf --version + which automake + automake --version + which libtool + libtool --version + which m4 + m4 --version + autoreconf -i + mkdir -p ${GITHUB_WORKSPACE}/pnetcdf_output + if test "${{ matrix.fstype }}" = "mimic_lustre" ; then + export MIMIC_LUSTRE=yes + fi + ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ + --enable-option-checking=fatal \ + pnc_ac_debug=yes \ + --enable-burst_buffering \ + --enable-subfiling \ + --enable-thread-safe \ + --with-pthread \ + --with-mpi=${GITHUB_WORKSPACE}/MPI \ + TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output + make -s LIBTOOLFLAGS=--silent V=1 -j 8 tests + + - name: Print config.log + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log + + - name: make check + run: | + cd ${GITHUB_WORKSPACE} + make -s LIBTOOLFLAGS=--silent V=1 check + + - name: Print test log files + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + + - name: make ptests + run: | + cd ${GITHUB_WORKSPACE} + if test "${{ matrix.mpi_vendor }}" = "OpenMPI" && test "${{ matrix.os }}" = "macos-latest" ; then + # OpenMPI hacks when running in 
parallel on MacOS + export OMPI_MCA_btl_tcp_if_include=lo0 + export TMPDIR=${RUNNER_TEMP} + fi + make -s LIBTOOLFLAGS=--silent V=1 ptests + + - name: make distcheck + run: | + cd ${GITHUB_WORKSPACE} + make -j 8 distcheck DISTCHECK_CONFIGURE_FLAGS="--silent --with-mpi=${GITHUB_WORKSPACE}/MPI" + + - name: make install + run: | + set -x + cd ${GITHUB_WORKSPACE} + prefix_path=${GITHUB_WORKSPACE}/pnetcdf_install + echo "---- test make install prefix=${prefix_path}" + make -s LIBTOOLFLAGS=--silent V=1 install prefix=${prefix_path} + test/tst_install.sh ${prefix_path} + prefix_path="/pnetcdf_install" + destdir_path=${GITHUB_WORKSPACE}/inst + echo "---- test make install prefix=${prefix_path} DESTDIR=${destdir_path}" + make -s LIBTOOLFLAGS=--silent V=1 install prefix=${prefix_path} DESTDIR=${destdir_path} + test/tst_install.sh ${prefix_path} ${destdir_path} + + - name: Cleanup + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + make -s LIBTOOLFLAGS=--silent V=1 distclean diff --git a/.github/workflows/netcdf4_adios.yml b/.github/workflows/netcdf4_adios.yml new file mode 100644 index 0000000000..5090aa07d7 --- /dev/null +++ b/.github/workflows/netcdf4_adios.yml @@ -0,0 +1,236 @@ +# Test NetCDF4 and ADIOS +name: CI - NetCDF4 and ADIOS + +permissions: + contents: read + +on: + push: + branches: [ master ] + paths-ignore: + - '**/*.md' + - '**/*.txt' + - '**/*.1' + - '**/*.jpg' + - '**/*.png' + - 'docs/*' + - 'test/test_installed/*' + pull_request: + branches: [ master ] + paths-ignore: + - '**/*.md' + - '**/*.txt' + - '**/*.1' + - '**/*.jpg' + - '**/*.png' + - 'docs/*' + - 'test/test_installed/*' + +env: + LIBTOOL_VERSION: 2.5.4 + MPICH_LATEST: 4.3.2 + HDF5_VERSION: 1.14.6 + NETCDF4_VERSION: 4.9.3 + ADIOS_VERSION: 1.13.1 + +jobs: + build: + strategy: + fail-fast: false # This disables the default cancel-on-failure behavior + + runs-on: ubuntu-latest + timeout-minutes: 120 + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v4 + + - name: 
Install autotools + run: | + set -x + sudo apt-get update + sudo apt-get install -y autoconf # -y: the runner has no TTY to answer prompts + sudo apt-get install -y automake + sudo apt-get install -y m4 + # sudo apt-get install libtool libtool-bin + wget -q https://ftp.gnu.org/gnu/libtool/libtool-${LIBTOOL_VERSION}.tar.gz + gzip -dc libtool-${LIBTOOL_VERSION}.tar.gz | tar -xf - + cd libtool-${LIBTOOL_VERSION} + ./configure --prefix=/usr --silent + sudo make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 + sudo make -s LIBTOOLFLAGS=--silent V=1 -j 8 distclean >> qout 2>&1 + which autoconf + autoconf --version + which automake + automake --version + which libtool + libtool --version + which m4 + m4 --version + + - name: Build and install MPICH + run: | + # MPICH versions older than 4.2.2 do not support the MPI large + # count feature. + echo "Install MPICH ${MPICH_LATEST} in ${GITHUB_WORKSPACE}/MPICH" + wget -q https://www.mpich.org/static/downloads/${MPICH_LATEST}/mpich-${MPICH_LATEST}.tar.gz + gzip -dc mpich-${MPICH_LATEST}.tar.gz | tar -xf - + cd mpich-${MPICH_LATEST} + ./configure --prefix=${GITHUB_WORKSPACE}/MPICH \ + --silent \ + --enable-romio \ + --with-file-system=ufs \ + --with-device=ch3:sock \ + --enable-fortran \ + CC=gcc FC=gfortran \ + FFLAGS=-fallow-argument-mismatch \ + FCFLAGS=-fallow-argument-mismatch + make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 + make -s LIBTOOLFLAGS=--silent V=1 -j 8 distclean >> qout 2>&1 + + - name: Build and install HDF5 + if: ${{ success() }} + run: | + set -x + cd ${GITHUB_WORKSPACE} + rm -rf HDF5 ; mkdir HDF5 ; cd HDF5 + curl -LO https://github.com/HDFGroup/hdf5/releases/download/hdf5_${HDF5_VERSION}/hdf5-${HDF5_VERSION}.tar.gz + tar -zxf hdf5-${HDF5_VERSION}.tar.gz + cd hdf5-${HDF5_VERSION} + ./configure --prefix=${GITHUB_WORKSPACE}/HDF5 \ + --silent \ + --enable-hl \ + --enable-parallel \ + --enable-build-mode=production \ + --disable-doxygen-doc \ + --disable-doxygen-man \ + --disable-doxygen-html \ + --disable-tools \ + --disable-tests \ + 
--disable-fortran \ + --disable-cxx \ + CC=${GITHUB_WORKSPACE}/MPICH/bin/mpicc + make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 + make -s distclean >> qout 2>&1 + + - name: Build and install NetCDF4 + if: ${{ success() }} + run: | + set -x + cd ${GITHUB_WORKSPACE} + rm -rf NetCDF ; mkdir NetCDF ; cd NetCDF + curl -LO https://github.com/Unidata/netcdf-c/archive/refs/tags/v${NETCDF4_VERSION}.tar.gz + tar -zxf v${NETCDF4_VERSION}.tar.gz + cd netcdf-c-${NETCDF4_VERSION} + ./configure --prefix=${GITHUB_WORKSPACE}/NetCDF \ + --silent \ + --disable-doxygen \ + --disable-mmap \ + --disable-dap \ + --disable-nczarr \ + --disable-nczarr-filters \ + --disable-filter-testing \ + --disable-quantize \ + --disable-byterange \ + CC=${GITHUB_WORKSPACE}/MPICH/bin/mpicc \ + CPPFLAGS="-I${GITHUB_WORKSPACE}/HDF5/include" \ + LDFLAGS="-L${GITHUB_WORKSPACE}/HDF5/lib" \ + LIBS="-lhdf5" + make -s LIBTOOLFLAGS=--silent V=1 -j 8 install > qout 2>&1 + make -s distclean >> qout 2>&1 + + - name: Build and install ADIOS + if: ${{ success() }} + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/MPICH/bin:${PATH}" + wget -q https://users.nccs.gov/~pnorbert/adios-${ADIOS_VERSION}.tar.gz + gzip -dc adios-${ADIOS_VERSION}.tar.gz | tar -xf - + cd adios-${ADIOS_VERSION} + mkdir build && cd build + ../configure --prefix=${GITHUB_WORKSPACE}/ADIOS \ + --silent \ + --with-mpi=${GITHUB_WORKSPACE}/MPICH \ + --disable-fortran + make -j 8 >> qout 2>&1 + make -j 8 install >> qout 2>&1 + + - name: Build PnetCDF + if: ${{ success() }} + run: | + set -x + cd ${GITHUB_WORKSPACE} + which autoconf + autoconf --version + which automake + automake --version + which libtool + libtool --version + which m4 + m4 --version + autoreconf -i + mkdir -p ${GITHUB_WORKSPACE}/pnetcdf_output + ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ + --enable-option-checking=fatal \ + pnc_ac_debug=yes \ + --with-netcdf4=${GITHUB_WORKSPACE}/NetCDF \ + --with-adios=${GITHUB_WORKSPACE}/ADIOS \ + 
--with-mpi=${GITHUB_WORKSPACE}/MPICH \ + TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output + make -s LIBTOOLFLAGS=--silent V=1 -j 8 tests + + - name: Print config.log + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log + + - name: make check + if: ${{ success() }} + run: | + cd ${GITHUB_WORKSPACE} + make -s LIBTOOLFLAGS=--silent V=1 check + + - name: Print test log files + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + + - name: make ptests + if: ${{ success() }} + run: | + cd ${GITHUB_WORKSPACE} + make -s LIBTOOLFLAGS=--silent V=1 ptests + + - name: make distcheck + if: ${{ success() }} + run: | + cd ${GITHUB_WORKSPACE} + make -j 8 distcheck DISTCHECK_CONFIGURE_FLAGS="--silent --with-mpi=${GITHUB_WORKSPACE}/MPICH" + + - name: make install + if: ${{ success() }} + run: | + set -x + cd ${GITHUB_WORKSPACE} + prefix_path=${GITHUB_WORKSPACE}/pnetcdf_install + echo "---- test make install prefix=${prefix_path}" + make -s LIBTOOLFLAGS=--silent V=1 install prefix=${prefix_path} + test/tst_install.sh ${prefix_path} + prefix_path="/pnetcdf_install" + destdir_path=${GITHUB_WORKSPACE}/inst + echo "---- test make install prefix=${prefix_path} DESTDIR=${destdir_path}" + make -s LIBTOOLFLAGS=--silent V=1 install prefix=${prefix_path} DESTDIR=${destdir_path} + test/tst_install.sh ${prefix_path} ${destdir_path} + + - name: Cleanup + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + make -s LIBTOOLFLAGS=--silent V=1 distclean diff --git a/.github/workflows/ubuntu_mpich.yml b/.github/workflows/ubuntu_mpich.yml deleted file mode 100644 index 09d626f8f1..0000000000 --- a/.github/workflows/ubuntu_mpich.yml +++ /dev/null @@ -1,215 +0,0 @@ -name: ubuntu_mpich - -on: - push: - branches: master - paths-ignore: - - 
'**/*.md' - - '**/*.txt' - - '**/*.1' - - 'docs/*' - - 'test/test_installed/*' - pull_request: - branches: master - paths-ignore: - - '**/*.md' - - '**/*.txt' - - '**/*.1' - - 'docs/*' - - 'test/test_installed/*' - -env: - MPICH_VERSION: 4.3.0 - AUTOCONF_VERSION: 2.71 - AUTOMAKE_VERSION: 1.17 - LIBTOOL_VERSION: 2.5.4 - M4_VERSION: 1.4.19 - -jobs: - build: - runs-on: ubuntu-latest - timeout-minutes: 60 - steps: - - uses: actions/checkout@v4 - - name: Set up dependencies - run: | - sudo apt-get update - # install gfortran - version=12 - sudo add-apt-repository ppa:ubuntu-toolchain-r/test - sudo apt-get update - sudo apt-get install -y gcc-${version} gfortran-${version} - sudo update-alternatives \ - --install /usr/bin/gcc gcc /usr/bin/gcc-${version} 100 \ - --slave /usr/bin/gfortran gfortran /usr/bin/gfortran-${version} \ - --slave /usr/bin/gcov gcov /usr/bin/gcov-${version} - echo "---- gcc/gfortran version ------------------------------" - which gcc - which gfortran - gcc --version - gfortran --version - - name: Clean up git untracked files - run: | - git clean -fx - - name: Build GNU autotools - run: | - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/m4/m4-${M4_VERSION}.tar.gz - gzip -dc m4-${M4_VERSION}.tar.gz | tar -xf - - cd m4-${M4_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/autoconf/autoconf-${AUTOCONF_VERSION}.tar.gz - gzip -dc autoconf-${AUTOCONF_VERSION}.tar.gz | tar -xf - - cd autoconf-${AUTOCONF_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/automake/automake-${AUTOMAKE_VERSION}.tar.gz - gzip 
-dc automake-${AUTOMAKE_VERSION}.tar.gz | tar -xf - - cd automake-${AUTOMAKE_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/libtool/libtool-${LIBTOOL_VERSION}.tar.gz - gzip -dc libtool-${LIBTOOL_VERSION}.tar.gz | tar -xf - - cd libtool-${LIBTOOL_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - - name: Build MPICH - run: | - cd ${GITHUB_WORKSPACE} - echo "Install MPICH ${MPICH_VERSION} in ${GITHUB_WORKSPACE}/MPICH" - rm -rf MPICH ; mkdir MPICH ; cd MPICH - # git clone -q https://github.com/pmodels/mpich.git - # cd mpich - # git submodule update --init - # ./autogen.sh - wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz - gzip -dc mpich-${MPICH_VERSION}.tar.gz | tar -xf - - cd mpich-${MPICH_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/MPICH \ - --silent \ - --enable-romio \ - --with-file-system=ufs \ - --with-device=ch3:sock \ - --enable-fortran \ - CC=gcc FC=gfortran \ - FFLAGS=-fallow-argument-mismatch \ - FCFLAGS=-fallow-argument-mismatch - make -s LIBTOOLFLAGS=--silent V=1 -j 4 install > qout 2>&1 - make -s -j 4 distclean >> qout 2>&1 - - name: Build PnetCDF - run: | - cd ${GITHUB_WORKSPACE} - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - m4 --version - autoconf --version - automake --version - libtool --version - autoreconf -i - ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ - --enable-option-checking=fatal \ - --enable-profiling \ - pnc_ac_debug=yes \ - --enable-burst_buffering \ - --enable-subfiling \ - --enable-thread-safe \ - --with-pthread \ - --with-mpi=${GITHUB_WORKSPACE}/MPICH - make -j 8 tests - - name: Print config.log - if: ${{ always() }} - run: | - 
cat ${GITHUB_WORKSPACE}/config.log - - name: make check - run: | - cd ${GITHUB_WORKSPACE} - make check - - name: Print test log files - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - fname=`find src test examples benchmarks -type f -name "*.log"` - for f in $fname ; do \ - bname=`basename $f` ; \ - if test "x$bname" != xconfig.log ; then \ - echo "-------- dump $f ----------------------------" ; \ - cat $f ; \ - fi ; \ - done - - name: make ptests - run: | - cd ${GITHUB_WORKSPACE} - make ptests - - name: Build PnetCDF (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - make distclean - ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ - --with-mpi=${GITHUB_WORKSPACE}/MPICH - make -j 8 tests - - name: Print config.log (default configuration) - if: ${{ always() }} - run: | - cat ${GITHUB_WORKSPACE}/config.log - - name: make check (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - make check - - name: Print test log files (default configuration) - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - fname=`find src test examples benchmarks -type f -name "*.log"` - for f in $fname ; do \ - bname=`basename $f` ; \ - if test "x$bname" != xconfig.log ; then \ - echo "-------- dump $f ----------------------------" ; \ - cat $f ; \ - fi ; \ - done - - name: make ptests (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - make ptests - - name: make distcheck - run: | - cd ${GITHUB_WORKSPACE} - make -j 8 distcheck DISTCHECK_CONFIGURE_FLAGS="--silent --with-mpi=${GITHUB_WORKSPACE}/MPICH" - - name: make install - run: | - cd ${GITHUB_WORKSPACE} - prefix_path=${GITHUB_WORKSPACE}/pnetcdf_install - echo "---- test make install prefix=${prefix_path}" - make install prefix=${prefix_path} - test/tst_install.sh ${prefix_path} - prefix_path="/pnetcdf_install" - destdir_path=${GITHUB_WORKSPACE}/inst - echo "---- 
test make install prefix=${prefix_path} DESTDIR=${destdir_path}" - make install prefix=${prefix_path} DESTDIR=${destdir_path} - test/tst_install.sh ${prefix_path} ${destdir_path} - - name: Cleanup - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - make -s distclean - rm -rf ${GITHUB_WORKSPACE}/pnetcdf_output - rm -rf ${GITHUB_WORKSPACE}/MPICH - rm -rf ${GITHUB_WORKSPACE}/pnetcdf_install - rm -rf ${GITHUB_WORKSPACE}/inst - diff --git a/.github/workflows/ubuntu_openmpi.yml b/.github/workflows/ubuntu_openmpi.yml deleted file mode 100644 index 80f087295e..0000000000 --- a/.github/workflows/ubuntu_openmpi.yml +++ /dev/null @@ -1,213 +0,0 @@ -name: ubuntu_openmpi - -on: - push: - branches: master - paths-ignore: - - '**/*.md' - - '**/*.txt' - - '**/*.1' - - 'docs/*' - - 'test/test_installed/*' - pull_request: - branches: master - paths-ignore: - - '**/*.md' - - '**/*.txt' - - '**/*.1' - - 'docs/*' - - 'test/test_installed/*' - -env: - OPENMPI_VERSION: 5.0.2 - AUTOCONF_VERSION: 2.71 - AUTOMAKE_VERSION: 1.17 - LIBTOOL_VERSION: 2.5.4 - M4_VERSION: 1.4.19 - -jobs: - build: - runs-on: ubuntu-latest - timeout-minutes: 90 - steps: - - uses: actions/checkout@v4 - - name: Set up dependencies - run: | - sudo apt-get update - # install gfortran - version=12 - sudo add-apt-repository ppa:ubuntu-toolchain-r/test - sudo apt-get update - sudo apt-get install -y gcc-${version} gfortran-${version} - sudo update-alternatives \ - --install /usr/bin/gcc gcc /usr/bin/gcc-${version} 100 \ - --slave /usr/bin/gfortran gfortran /usr/bin/gfortran-${version} \ - --slave /usr/bin/gcov gcov /usr/bin/gcov-${version} - echo "---- gcc/gfortran version ------------------------------" - which gcc - which gfortran - gcc --version - gfortran --version - - name: Clean up git untracked files - run: | - git clean -fx - - name: Build GNU autotools - run: | - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - cd 
${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/m4/m4-${M4_VERSION}.tar.gz - gzip -dc m4-${M4_VERSION}.tar.gz | tar -xf - - cd m4-${M4_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/autoconf/autoconf-${AUTOCONF_VERSION}.tar.gz - gzip -dc autoconf-${AUTOCONF_VERSION}.tar.gz | tar -xf - - cd autoconf-${AUTOCONF_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/automake/automake-${AUTOMAKE_VERSION}.tar.gz - gzip -dc automake-${AUTOMAKE_VERSION}.tar.gz | tar -xf - - cd automake-${AUTOMAKE_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - cd ${GITHUB_WORKSPACE} - wget -q https://ftp.gnu.org/gnu/libtool/libtool-${LIBTOOL_VERSION}.tar.gz - gzip -dc libtool-${LIBTOOL_VERSION}.tar.gz | tar -xf - - cd libtool-${LIBTOOL_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/AUTOTOOLS \ - --silent - make -s -j 8 install > qout 2>&1 - make -s -j 8 distclean >> qout 2>&1 - - name: Build OPENMPI - run: | - cd ${GITHUB_WORKSPACE} - echo "Install OPENMPI ${OPENMPI_VERSION} in ${GITHUB_WORKSPACE}/OPENMPI" - rm -rf OPENMPI ; mkdir OPENMPI ; cd OPENMPI - VER_MAJOR=${OPENMPI_VERSION%.*} - wget -q https://download.open-mpi.org/release/open-mpi/v${VER_MAJOR}/openmpi-${OPENMPI_VERSION}.tar.gz - gzip -dc openmpi-${OPENMPI_VERSION}.tar.gz | tar -xf - - cd openmpi-${OPENMPI_VERSION} - ./configure --prefix=${GITHUB_WORKSPACE}/OPENMPI \ - --silent \ - --with-io-romio-flags="--with-file-system=ufs" \ - CC=gcc \ - FC=gfortran \ - FCFLAGS=-fallow-argument-mismatch - make -s LIBTOOLFLAGS=--silent V=1 -j 4 install > qout 2>&1 - make -s -j 4 distclean >> qout 2>&1 - - name: Build PnetCDF 
- run: | - cd ${GITHUB_WORKSPACE} - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - m4 --version - autoconf --version - automake --version - libtool --version - autoreconf -i - mkdir -p pnetcdf_output - ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ - --enable-option-checking=fatal \ - --enable-profiling \ - pnc_ac_debug=yes \ - --enable-burst_buffering \ - --enable-subfiling \ - --enable-thread-safe \ - --with-pthread \ - --with-mpi=${GITHUB_WORKSPACE}/OPENMPI \ - TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output - make -j 8 tests - - name: Print config.log - if: ${{ always() }} - run: | - cat ${GITHUB_WORKSPACE}/config.log - - name: make check - run: | - cd ${GITHUB_WORKSPACE} - make check - - name: Print test log files - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - fname=`find src test examples benchmarks -type f -name "*.log"` - for f in $fname ; do \ - bname=`basename $f` ; \ - if test "x$bname" != xconfig.log ; then \ - echo "-------- dump $f ----------------------------" ; \ - cat $f ; \ - fi ; \ - done - - name: make ptests - run: | - cd ${GITHUB_WORKSPACE} - make ptests - - name: Build PnetCDF (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" - export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" - make distclean - mkdir -p pnetcdf_output - ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ - --with-mpi=${GITHUB_WORKSPACE}/OPENMPI \ - TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output - make -j 8 tests - - name: Print config.log (default configuration) - if: ${{ always() }} - run: | - cat ${GITHUB_WORKSPACE}/config.log - - name: make check (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - make check - - name: Print test log files (default configuration) - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - fname=`find src test examples benchmarks -type f 
-name "*.log"` - for f in $fname ; do \ - bname=`basename $f` ; \ - if test "x$bname" != xconfig.log ; then \ - echo "-------- dump $f ----------------------------" ; \ - cat $f ; \ - fi ; \ - done - - name: make ptests (default configuration) - run: | - cd ${GITHUB_WORKSPACE} - make ptests - - name: make distcheck - run: | - cd ${GITHUB_WORKSPACE} - make -j 8 distcheck DISTCHECK_CONFIGURE_FLAGS="--silent --with-mpi=${GITHUB_WORKSPACE}/OPENMPI" - - name: make install - run: | - cd ${GITHUB_WORKSPACE} - prefix_path=${GITHUB_WORKSPACE}/pnetcdf_install - echo "---- test make install prefix=${prefix_path}" - make install prefix=${prefix_path} - test/tst_install.sh ${prefix_path} - prefix_path="/pnetcdf_install" - destdir_path=${GITHUB_WORKSPACE}/inst - echo "---- test make install prefix=${prefix_path} DESTDIR=${destdir_path}" - make install prefix=${prefix_path} DESTDIR=${destdir_path} - test/tst_install.sh ${prefix_path} ${destdir_path} - - name: Cleanup - if: ${{ always() }} - run: | - cd ${GITHUB_WORKSPACE} - make -s distclean - rm -rf ${GITHUB_WORKSPACE}/pnetcdf_output - rm -rf ${GITHUB_WORKSPACE}/OPENMPI - rm -rf ${GITHUB_WORKSPACE}/pnetcdf_install - rm -rf ${GITHUB_WORKSPACE}/inst - diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index f1b3693a6c..0ca204b832 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -149,12 +149,17 @@ 10. Generate SHA1 checksums * Run command: ``` - openssl sha1 pnetcdf-1.11.0.tar.gz` + openssl sha1 pnetcdf-1.11.0.tar.gz ``` * Example command-line output: ``` SHA1(pnetcdf-1.11.0.tar.gz)= 495d42f0a41abbd09d276262dce0f7c1c535968a ``` + * Or use SHA 256 + ``` + sha256sum pnetcdf-1.11.0.tar.gz + a18a1a43e6c4fd7ef5827dbe90e9dcf1363b758f513af1f1356ed6c651195a9f pnetcdf-1.11.0.tar.gz + ``` 11. Update PnetCDF Web Page * https://github.com/Parallel-NetCDF/Parallel-NetCDF.github.io * Create a new file of release note Parallel-NetCDF.github.io/Release_notes/1.11.0.md. 
diff --git a/Makefile.am b/Makefile.am index 8e116c08ab..6054407c44 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,8 @@ EXTRA_DIST = COPYRIGHT \ README \ RELEASE_NOTES \ m4/foreach.m4 \ + m4/foreach_idx.m4 \ + m4/list_len.m4 \ m4/utils.m4 # Below is a trick to build all test executables, without running them diff --git a/README.md b/README.md index bd76d61e78..c1a49c10cb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ ## PnetCDF source code development repository -[![MPICH](https://github.com/Parallel-NetCDF/PnetCDF/actions/workflows/ubuntu_mpich.yml/badge.svg)](https://github.com/Parallel-NetCDF/PnetCDF/actions/workflows/ubuntu_mpich.yml) -[![OpenMPI](https://github.com/Parallel-NetCDF/PnetCDF/actions/workflows/ubuntu_openmpi.yml/badge.svg)](https://github.com/Parallel-NetCDF/PnetCDF/actions/workflows/ubuntu_openmpi.yml) - +[![CI - OS and MPI](https://github.com/Parallel-NetCDF/PnetCDF/actions/workflows/main.yml/badge.svg)](https://github.com/Parallel-NetCDF/PnetCDF/actions/workflows/main.yml) PnetCDF is a parallel I/O library for accessing [Unidata's NetCDF](http://www.unidata.ucar.edu/software/netcdf) files in @@ -17,6 +15,7 @@ Northwestern University and Argonne National Laboratory. contains more information about PnetCDF. ### PnetCDF official software releases +* The [alpha release of 1.15.0](https://parallel-netcdf.github.io/Release/pnetcdf-1.15.0-alpha.tar.gz) is available on October 27, 2025. * The latest stable release is [pnetcdf-1.14.1.tar.gz](https://parallel-netcdf.github.io/Release/pnetcdf-1.14.1.tar.gz) ([release note](https://github.com/Parallel-NetCDF/Parallel-NetCDF.github.io/blob/master/Release_notes/1.14.1.md)), diff --git a/RELEASE_NOTES b/RELEASE_NOTES index c0e031f48c..ae041c3694 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -6,6 +6,44 @@ PnetCDF Release Notes Version _PNETCDF_VERSION_ (_PNETCDF_RELEASE_DATE_) ------------------------------------- +* New feature + + Intra-node aggregation for read requests is added. 
This is the complement + of the write requests first implemented in version 1.14.0. Now intra-node + aggregation supports both write and read operations. This feature can be + enabled by setting hint "nc_num_aggrs_per_node" to the desired number of + aggregators per compute node. + +* New optimization + + A new internal I/O driver, named "pncio", is added which implements several + strategies for performance improvement. A significant portion of this + driver was developed to improve performance when Lustre is used. It + includes the following. + * When creating a new file, it tries to set the Lustre file striping count + to the number of compute nodes allocated to the MPI communicator passed + to "ncmpi_create()", when I/O hint "striping_factor" is not explicitly + set by the applications. + * It automatically sets a good value for hint "cb_nodes" when it is not + explicitly set by the applications. + +* API deprecated + + "vard" APIs introduced in version 1.6.0 are now deprecated. This is the + API family that takes an argument of an MPI derived data type describing the + file access layout, which is used as the fileview by the underlying MPI + library. + +* New error code + + "NC_EFSTYPE" indicates an error when an invalid file system type is + detected. + +* New PnetCDF hint + + "nc_pncio" -- To disable or enable the use of the internal "pncio" driver. + Its string value is either "enable" or "disable". The default is "enable". + + +------------------------------------- +Version 1.14.1 (July 31, 2025) +------------------------------------- + * New optimization + When file header extent size grows, moving the data section to a higher file offset has changed to be done in chunks of 16 MB per process. @@ -68,27 +106,28 @@ Version _PNETCDF_VERSION_ (_PNETCDF_RELEASE_DATE_) + There are three ways in PnetCDF for user to set hints to align the starting file offset for the data section (header extent) and record variable section. - 1. 
through a call to API `nc_header_align_size` by setting arguments of - `h_minfree`, `v_align`, `v_minfree`, and `r_align`. - 2. through an MPI info object passed to calls of `ncmpi_create()` and - `ncmpi_open()`. Hints are `nc_header_align_size`, `nc_var_align_size`, - and `nc_record_align_size`. - 3. through a run-time environment variable `PNETCDF_HINTS`. Hints are - `nc_header_align_size`, `nc_var_align_size`, and `nc_record_align_size`. + 1. through a call to API `ncmpi__enddef` by setting arguments `h_minfree`, + `v_align`, `v_minfree`, and `r_align`. + 2. through passing an MPI info object to a call of `ncmpi_create()` and + `ncmpi_open()`. Hints include `nc_header_align_size`, + `nc_var_align_size`, and `nc_record_align_size`. + 3. through setting hints in an environment variable `PNETCDF_HINTS` at the + run time. Hints include `nc_header_align_size`, `nc_var_align_size`, and + `nc_record_align_size`. + As the same hints may be set by one or more of the above methods, PnetCDF implements the following hint precedence. * `PNETCDF_HINTS` > `ncmpi__enddef()` > `MPI info`. - * 1st priority: hints set in the environment variable `PNETCDF_HINTS`, e.g. - `PNETCDF_HINTS="nc_var_align_size=1048576"`. Making this the first - priority is because it allows to run the same application executable - without source code modification using different alignment settings - through a run-time environment variable. - * 2nd priority: hints set in the MPI info object passed to calls of - `ncmpi_create()` and `ncmpi_open()`, e.g. - `MPI_Info_set("nc_var_align_size", "1048576");`. The reasoning is when a - 3rd-party library built on top of PnetCDF implements its codes using - 'ncmpi__enddef'. An application that uses such 3rd-party library can pass - an MPI info object to it, which further passes the info to PnetCDF. This + * 1st priority: hints set in the run-time environment variable + `PNETCDF_HINTS`, e.g. `PNETCDF_HINTS="nc_var_align_size=1048576"`. 
Making + this the first priority is because it allows the same application + executable without source code modification to run using different + alignment settings in the run-time environment variable. + * 2nd priority: hints set in the MPI info object, e.g. + `MPI_Info_set("nc_var_align_size", "1048576");`, passed to calls of + `ncmpi_create()` and `ncmpi_open()`. The reasoning is that a 3rd-party + library built on top of PnetCDF may call 'ncmpi__enddef' with its setting + of alignment values. An application that uses such a 3rd-party library can + pass an MPI info object to it, which is then passed to PnetCDF. This precedence allows that application to exercise different hints without changing the 3rd-party library's source codes. * 3rd priority: hints used in the arguments of `ncmpi__enddef()`, e.g. diff --git a/benchmarks/C/Makefile.am b/benchmarks/C/Makefile.am index 333176cbf5..5879074d60 100644 --- a/benchmarks/C/Makefile.am +++ b/benchmarks/C/Makefile.am @@ -24,11 +24,24 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export ENABLE_NETCDF4="$(ENABLE_NETCDF4)"; + +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_NETCDF4 + TESTS_ENVIRONMENT += export ENABLE_NETCDF4=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \ 
$(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) @@ -40,14 +53,42 @@ CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \ # be used to compare NetCDF4 performance against PnetCDF. EXTRA_DIST = parallel_run.sh netcdf_put_vara.c -ptest ptests ptest4: $(check_PROGRAMS) +ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 4 || exit 1 -ptest2 ptest6 ptest8 ptest10: +ptest2: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 2 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 2 || exit 1 + +ptest6: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 6 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 6 || exit 1 + +ptest8: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 8 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 8 || exit 1 + +ptest10: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 10 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 10 || exit 1 + +ptests: ptest4 ptest6 # build check targets but not invoke tests-local: all $(check_PROGRAMS) diff --git a/benchmarks/C/aggregation.c b/benchmarks/C/aggregation.c index 3f2f3afccc..c834d562f4 100644 --- a/benchmarks/C/aggregation.c +++ 
b/benchmarks/C/aggregation.c @@ -122,6 +122,7 @@ typedef struct { int star_block; int blocking_io; int double_xtype; + int indep_io; MPI_Offset len; MPI_Offset w_size; MPI_Offset r_size; @@ -287,6 +288,11 @@ int benchmark_write(char *filename, sts = (int*) malloc(sizeof(int) * num_reqs); err = ncmpi_enddef(ncid); ERR(err) + if (cfg->indep_io) { + err = ncmpi_begin_indep_data(ncid); + ERR(err) + } + err = ncmpi_inq_header_size(ncid, &cfg->header_size); ERR(err) err = ncmpi_inq_header_extent(ncid, &cfg->header_extent); ERR(err) end_t = MPI_Wtime(); @@ -306,12 +312,17 @@ int benchmark_write(char *filename, start[1] = cfg->len * (rank / psizes[1]); count[1] = cfg->len; count[2] = cfg->len; - if (cfg->blocking_io) - err = ncmpi_put_vara_double_all(ncid, varid[v], start, + if (cfg->blocking_io) { + if (cfg->indep_io) + err = ncmpi_put_vara_double(ncid, varid[v], start, count, buf[v]); + else + err = ncmpi_put_vara_double_all(ncid, varid[v], start, + count, buf[v]); + } else err = ncmpi_iput_vara_double(ncid, varid[v], start, count, - buf[v], &reqs[k++]); + buf[v], &reqs[k++]); ERR(err) if (debug) DBG_PRINT("block-block", n, i); v++; @@ -323,9 +334,14 @@ int benchmark_write(char *filename, count[2] = cfg->len; stride[1] = 1; stride[2] = nprocs; - if (cfg->blocking_io) - err = ncmpi_put_vars_double_all(ncid, varid[v], start, + if (cfg->blocking_io) { + if (cfg->indep_io) + err = ncmpi_put_vars_double(ncid, varid[v], start, count, stride, buf[v]); + else + err = ncmpi_put_vars_double_all(ncid, varid[v], start, + count, stride, buf[v]); + } else err = ncmpi_iput_vars_double(ncid, varid[v], start, count, stride, buf[v], &reqs[k++]); @@ -338,9 +354,14 @@ int benchmark_write(char *filename, start[2] = 0; count[1] = cfg->len; count[2] = bs_gsizes[2]; - if (cfg->blocking_io) - err = ncmpi_put_vara_double_all(ncid, varid[v], start, + if (cfg->blocking_io) { + if (cfg->indep_io) + err = ncmpi_put_vara_double(ncid, varid[v], start, count, buf[v]); + else + err = 
ncmpi_put_vara_double_all(ncid, varid[v], start, + count, buf[v]); + } else err = ncmpi_iput_vara_double(ncid, varid[v], start, count, buf[v], &reqs[k++]); @@ -353,9 +374,14 @@ int benchmark_write(char *filename, start[2] = cfg->len * rank; count[1] = sb_gsizes[1]; count[2] = cfg->len; - if (cfg->blocking_io) - err = ncmpi_put_vara_double_all(ncid, varid[v], start, + if (cfg->blocking_io) { + if (cfg->indep_io) + err = ncmpi_put_vara_double(ncid, varid[v], start, count, buf[v]); + else + err = ncmpi_put_vara_double_all(ncid, varid[v], start, + count, buf[v]); + } else err = ncmpi_iput_vara_double(ncid, varid[v], start, count, buf[v], &reqs[k++]); @@ -372,13 +398,13 @@ int benchmark_write(char *filename, if (!cfg->blocking_io) { start_t = end_t; -#ifdef USE_INDEP_MODE - err = ncmpi_begin_indep_data(ncid); ERR(err) - err = ncmpi_wait(ncid, num_reqs, reqs, sts); ERR(err) - err = ncmpi_end_indep_data(ncid); ERR(err) -#else - err = ncmpi_wait_all(ncid, num_reqs, reqs, sts); ERR(err) -#endif + + if (cfg->indep_io) + err = ncmpi_wait(ncid, num_reqs, reqs, sts); + else + err = ncmpi_wait_all(ncid, num_reqs, reqs, sts); + ERR(err) + /* check status of all requests */ for (i=0; iindep_io) { + err = ncmpi_begin_indep_data(ncid); + ERR(err) + } + /* Note that PnetCDF read the file in chunks of size 256KB, thus the read * amount may be more than the file header size */ @@ -518,8 +549,12 @@ int benchmark_read(char *filename, start[1] = cfg->len * (rank / psizes[1]); count[1] = cfg->len; count[2] = cfg->len; - if (cfg->blocking_io) - err = ncmpi_get_vara_double_all(ncid, v, start, count, buf[v]); + if (cfg->blocking_io) { + if (cfg->indep_io) + err = ncmpi_get_vara_double(ncid, v, start, count, buf[v]); + else + err = ncmpi_get_vara_double_all(ncid, v, start, count, buf[v]); + } else err = ncmpi_iget_vara_double(ncid, v, start, count, buf[v], &reqs[k++]); @@ -533,9 +568,14 @@ int benchmark_read(char *filename, count[2] = cfg->len; stride[1] = 1; stride[2] = nprocs; - if 
(cfg->blocking_io) - err = ncmpi_get_vars_double_all(ncid, v, start, count, + if (cfg->blocking_io) { + if (cfg->indep_io) + err = ncmpi_get_vars_double(ncid, v, start, count, stride, buf[v]); + else + err = ncmpi_get_vars_double_all(ncid, v, start, count, + stride, buf[v]); + } else err = ncmpi_iget_vars_double(ncid, v, start, count, stride, buf[v], &reqs[k++]); @@ -547,8 +587,12 @@ int benchmark_read(char *filename, start[2] = 0; count[1] = cfg->len; count[2] = bs_gsizes[2]; - if (cfg->blocking_io) - err = ncmpi_get_vara_double_all(ncid, v, start, count, buf[v]); + if (cfg->blocking_io) { + if (cfg->indep_io) + err = ncmpi_get_vara_double(ncid, v, start, count, buf[v]); + else + err = ncmpi_get_vara_double_all(ncid, v, start, count, buf[v]); + } else err = ncmpi_iget_vara_double(ncid, v, start, count, buf[v], &reqs[k++]); @@ -560,8 +604,12 @@ int benchmark_read(char *filename, start[2] = cfg->len * rank; count[1] = sb_gsizes[1]; count[2] = cfg->len; - if (cfg->blocking_io) - err = ncmpi_get_vara_double_all(ncid, v, start, count, buf[v]); + if (cfg->blocking_io) { + if (cfg->indep_io) + err = ncmpi_get_vara_double(ncid, v, start, count, buf[v]); + else + err = ncmpi_get_vara_double_all(ncid, v, start, count, buf[v]); + } else err = ncmpi_iget_vara_double(ncid, v, start, count, buf[v], &reqs[k++]); @@ -578,13 +626,12 @@ int benchmark_read(char *filename, if (!cfg->blocking_io) { start_t = end_t; -#ifdef USE_INDEP_MODE - err = ncmpi_begin_indep_data(ncid); ERR(err) - err = ncmpi_wait(ncid, num_reqs, reqs, sts); ERR(err) - err = ncmpi_end_indep_data(ncid); ERR(err) -#else - err = ncmpi_wait_all(ncid, num_reqs, reqs, sts); ERR(err) -#endif + if (cfg->indep_io) + err = ncmpi_wait(ncid, num_reqs, reqs, sts); + else + err = ncmpi_wait_all(ncid, num_reqs, reqs, sts); + ERR(err) + /* check status of all requests */ for (i=0; i 0); diff --git a/benchmarks/C/write_block_read_column.c b/benchmarks/C/write_block_read_column.c index 5048783f4d..9824b99b8f 100644 --- 
a/benchmarks/C/write_block_read_column.c +++ b/benchmarks/C/write_block_read_column.c @@ -68,6 +68,7 @@ void print_info(MPI_Info *info_used) /*----< benchmark_write() >---------------------------------------------------*/ static int benchmark_write(char *filename, + int indep_io, MPI_Offset len, MPI_Offset *w_size, MPI_Info *w_info_used, @@ -157,6 +158,12 @@ int benchmark_write(char *filename, } err = ncmpi_enddef(ncid); ERR(err) + + if (indep_io) { + err = ncmpi_begin_indep_data(ncid); + ERR(err) + } + end_t = MPI_Wtime(); timing[2] = end_t - start_t; start_t = end_t; @@ -171,19 +178,31 @@ int benchmark_write(char *filename, for (i=0; i---------------------------------------------------*/ static int benchmark_read(char *filename, + int indep_io, MPI_Offset len, MPI_Offset *r_size, MPI_Info *r_info_used, @@ -241,6 +261,11 @@ int benchmark_read(char *filename, timing[1] = start_t - timing[0]; MPI_Info_free(&info); + if (indep_io) { + err = ncmpi_begin_indep_data(ncid); + ERR(err) + } + err = ncmpi_inq_nvars(ncid, &nvars); ERR(err) err = ncmpi_inq_dimid(ncid, "Y", &dimid[0]); ERR(err) err = ncmpi_inq_dimid(ncid, "X", &dimid[1]); ERR(err) @@ -283,19 +308,31 @@ int benchmark_read(char *filename, for (i=0; inelems = (rank == 0) ? 1 : 0; + if (vars->nelems == 0) vars->count[0] = 0; } else if (ndims == 2) { err = cdl_hdr_inq_dim(hid, dimids[1], NULL, &dim1); CHECK_ERR("cdl_hdr_inq_dim") vars->nelems = (rank == 0) ? 
dim1 : 0; + if (vars->nelems == 0) vars->count[0] = 0; vars->count[1] = dim1; /* dimension dim1 is not partitioned */ vars->start[1] = 0; /* dimension dim1 is not partitioned */ } @@ -286,7 +308,7 @@ int inquire_vars(int ncid, my_start_x += latitude % psizes[1]; } - if (debug) { + if (verbose && debug) { printf("%2d: rank (%2d, %2d) start %4lld %4lld count %4lld %4lld\n", rank, my_rank_y, my_rank_x, my_start_y, my_start_x, my_count_y, my_count_x); fflush(stdout); @@ -343,10 +365,13 @@ int inquire_vars(int ncid, vars->count[0] = 1; /* time dimension */ /* In WRF, the first dimension is always NC_UNLIMITED */ - if (ndims == 1) + if (ndims == 1) { vars->nelems = (rank == 0) ? 1 : 0; + if (vars->nelems == 0) vars->count[0] = 0; + } else if (ndims == 2) { vars->nelems = (rank == 0) ? dim1 : 0; + if (vars->nelems == 0) vars->count[0] = 0; vars->count[1] = dim1; /* dimension dim1 is not partitioned */ vars->start[1] = 0; /* dimension dim1 is not partitioned */ } @@ -396,16 +421,19 @@ int def_dims_vars(int ncid, { char *name; void *value; - int i, j, err=NC_NOERR, ndims, nvars, nattrs; + int i, j, err=NC_NOERR, rank, ndims, nvars, nattrs; MPI_Offset size, nelems; nc_type xtype; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* define dimensions */ /* retrieve the number of dimensions defined in the CDL file */ err = cdl_hdr_inq_ndims(hid, &ndims); CHECK_ERR("cdl_hdr_inq_ndims") - if (debug) printf("dim: ndims %d\n", ndims); + if (verbose && debug && rank == 0) + printf("dim: ndims %d\n", ndims); for (i=0; ixtype, ndims, nattrs); - for (j=0; jdimids[j]); + if (verbose && debug && rank == 0) { + printf("\t var name %s type %s ndims %d nattr %d\n", + vars->name, str_NC_type(vars->xtype), vars->ndims, nattrs); + for (j=0; jndims; j++) { + char dname[64]; + ncmpi_inq_dimname(ncid, vars->dimids[j], dname); + printf("\t\tdimid %d, name: %s\n",vars->dimids[j], dname); + } } for (j=0; jvarid, name, xtype, nelems, value); @@ -470,90 +502,67 @@ int def_dims_vars(int ncid, /* retrieve the 
number of global attributes */ err = cdl_hdr_inq_nattrs(hid, NC_GLOBAL, &nattrs); CHECK_ERR("cdl_hdr_inq_nattrs") - if (debug) printf("global attrs: nattrs %d\n", nattrs); + if (verbose && debug && rank == 0) + printf("global attrs: nattrs %d\n", nattrs); for (i=0; i= ntimes */ + err = ncmpi_inq_unlimdim(ncid, &unlimdimid); + CHECK_ERR("ncmpi_inq_unlimdim") + err = ncmpi_inq_dimlen(ncid, unlimdimid, &dim_len); + CHECK_ERR("ncmpi_inq_dimlen") + if (dim_len < ntimes) { + if (rank == 0) + fprintf(stderr, "Error: input file expects to have at least %d time records but got %lld\n", ntimes, dim_len); + err = NC_EIO; + goto err_out; + } + err = ncmpi_inq_nvars(ncid, &nvars); CHECK_ERR("ncmpi_inq_nvars") @@ -793,48 +1002,11 @@ int wrf_r_benchmark(char *in_file, err = inquire_vars(ncid, vars, psizes, longitude, latitude, &buf_size); CHECK_ERR("inquire_vars") - if (debug) { + if (verbose && debug && rank == 0) printf("%2d: buf_size %lld\n", rank, buf_size); - fflush(stdout); - } - /* allocate and initialize read buffers */ - MPI_Offset mem_alloc; - if (debug) mem_alloc = 0; - - for (i=0; i= ntimes */ vars[i].start[0] = j; - if (vars[i].xtype == NC_FLOAT) - err = ncmpi_iget_vara_float(ncid, vars[i].varid, vars[i].start, - vars[i].count, vars[i].buf, NULL); - else if (vars[i].xtype == NC_INT) - err = ncmpi_iget_vara_int(ncid, vars[i].varid, vars[i].start, - vars[i].count, vars[i].buf, NULL); - else if (vars[i].xtype == NC_CHAR) - err = ncmpi_iget_vara_text(ncid, vars[i].varid, vars[i].start, - vars[i].count, vars[i].buf, NULL); + if (vars[i].xtype == NC_FLOAT) { + if (blocking) + err = ncmpi_get_vara_float_all(ncid, vars[i].varid, + vars[i].start, vars[i].count, + vars[i].buf); + else + err = ncmpi_iget_vara_float(ncid, vars[i].varid, + vars[i].start, vars[i].count, + vars[i].buf, NULL); + } + else if (vars[i].xtype == NC_INT) { + if (blocking) + err = ncmpi_get_vara_int_all(ncid, vars[i].varid, + vars[i].start, vars[i].count, + vars[i].buf); + else + err = 
ncmpi_iget_vara_int(ncid, vars[i].varid, + vars[i].start, vars[i].count, + vars[i].buf, NULL); + } + else if (vars[i].xtype == NC_CHAR) { + if (blocking) + err = ncmpi_get_vara_text_all(ncid, vars[i].varid, + vars[i].start, vars[i].count, + vars[i].buf); + else + err = ncmpi_iget_vara_text(ncid, vars[i].varid, + vars[i].start, vars[i].count, + vars[i].buf, NULL); + } CHECK_ERR(vars[i].name) } @@ -872,16 +1064,18 @@ int wrf_r_benchmark(char *in_file, timing[2] += end_t - start_t; start_t = end_t; - if (debug && rank == 0) { + if (verbose && debug && rank == 0) { printf("Flush read requests at end of iteration j=%d\n",j); fflush(stdout); } - /* flush all nonblocking read requests */ - err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); - CHECK_ERR("ncmpi_wait_all") - end_t = MPI_Wtime(); - timing[3] += end_t - start_t; + if (!blocking) { + /* flush all nonblocking read requests */ + err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); + CHECK_ERR("ncmpi_wait_all") + end_t = MPI_Wtime(); + timing[3] += end_t - start_t; + } } /* obtain the accumulated data amount read by this rank */ @@ -904,7 +1098,7 @@ int wrf_r_benchmark(char *in_file, char value[MPI_MAX_INFO_VAL+1]; int flag; - printf("-----------------------------------------------------------\n"); + printf("\n-----------------------------------------------------------\n"); printf("---- WRF-IO read benchmark ----\n"); printf("Input NetCDF file name: %s\n", in_file); printf("Number of MPI processes: %d\n", nprocs); @@ -915,11 +1109,19 @@ int wrf_r_benchmark(char *in_file, printf("Total read amount: %lld B\n", sum_r_size); printf(" %.2f MiB\n", (float)sum_r_size/1048576); printf(" %.2f GiB\n", (float)sum_r_size/1073741824); + if (blocking) + printf("Using PnetCDF blocking APIs\n"); + else + printf("Using PnetCDF non-blocking APIs\n"); double bw = (double)sum_r_size / 1048576; printf("Max open-to-close time: %.4f sec\n", max_t[0]); printf("Max inquire metadata time: %.4f sec\n", max_t[1]); - printf("Max iget posting 
time: %.4f sec\n", max_t[2]); - printf("Max wait_all time: %.4f sec\n", max_t[3]); + if (blocking) + printf("Max get time: %.4f sec\n", max_t[2]); + else { + printf("Max iget posting time: %.4f sec\n", max_t[2]); + printf("Max wait_all time: %.4f sec\n", max_t[3]); + } printf("Read bandwidth: %.2f MiB/s\n", bw/max_t[0]); printf(" %.2f GiB/s\n", bw/1024.0/max_t[0]); printf("-----------------------------------------------------------\n"); @@ -935,6 +1137,8 @@ int wrf_r_benchmark(char *in_file, printf("MPI-IO hint cb_config_list: %s\n", HINT); MPI_Info_get(info_used, "cb_node_list", MPI_MAX_INFO_VAL, value, &flag); printf("MPI-IO hint cb_node_list: %s\n", HINT); + MPI_Info_get(info_used, "nc_pncio", MPI_MAX_INFO_VAL, value, &flag); + printf("PnetCDF hint nc_pncio: %s\n", HINT); MPI_Info_get(info_used, "nc_num_aggrs_per_node",MPI_MAX_INFO_VAL, value, &flag); printf("PnetCDF hint nc_num_aggrs_per_node: %s\n", HINT); MPI_Info_get(info_used, "nc_ina_node_list", MPI_MAX_INFO_VAL, value, &flag); @@ -942,7 +1146,7 @@ int wrf_r_benchmark(char *in_file, MPI_Info_get(info_used, "cray_cb_nodes_multiplier", MPI_MAX_INFO_VAL, value, &flag); printf("Hint cray_cb_nodes_multiplier: %s\n", HINT); MPI_Info_get(info_used, "cray_cb_write_lock_mode", MPI_MAX_INFO_VAL, value, &flag); - printf("Hint cray_cb_write_lock_mode: %s\n", HINT); + printf("Hint cray_cb_write_lock_mode: %s\n", HINT); printf("-----------------------------------------------------------\n"); } MPI_Info_free(&info_used); @@ -958,6 +1162,240 @@ int wrf_r_benchmark(char *in_file, } if (err != NC_NOERR) return err; + /* check if there is any PnetCDF internal malloc residue */ + MPI_Offset malloc_size, sum_size; + err = ncmpi_inq_malloc_size(&malloc_size); + if (err == NC_ENOTENABLED) /* --enable-profiling is not set at configure */ + return NC_NOERR; + else if (err == NC_NOERR) { + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && sum_size > 0) + printf("heap memory allocated 
by PnetCDF internally has %lld bytes yet to be freed\n", + sum_size); + if (malloc_size > 0) ncmpi_inq_malloc_list(); + } + /* report the PnetCDF internal heap memory allocation high water mark */ + err = ncmpi_inq_malloc_max_size(&malloc_size); + if (err == NC_NOERR) { + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_MAX, 0, MPI_COMM_WORLD); + if (verbose && rank == 0) + printf("Max heap memory allocated by PnetCDF internally is %.2f MiB\n\n", + (float)sum_size/1048576); + } + fflush(stdout); + + return err; +} + +static +int grow_header_benchmark(char *in_file) +{ + char value[MPI_MAX_INFO_VAL], cb_node_list[MPI_MAX_INFO_VAL], *attr; + int i, err=NC_NOERR, nprocs, rank, ncid, ndims, dimid[3]; + int varid, unlimdimid, nvars, fix_nvars, rec_nvars, len, flag; + double timing, max_t; + MPI_Offset hdr_size, hdr_extent, attr_len, num_rec, longitude, latitude; + MPI_Offset r_amnt[2], w_amnt[2], amnt[2], sum_amnt[2], fix_off, rec_off; + MPI_Offset rec_size, nc_data_move_chunk_size; + MPI_Info info; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + /* open input file */ + err = ncmpi_open(MPI_COMM_WORLD, in_file, NC_WRITE, MPI_INFO_NULL, &ncid); + if (err != NC_NOERR) { + printf("Error at line=%d: opening file %s (%s)\n", + __LINE__, in_file, ncmpi_strerror(err)); + goto err_out; + } + + err = ncmpi_inq_dimid(ncid, "Time", &dimid[0]); + CHECK_ERR("ncmpi_inq_dimid") + err = ncmpi_inq_dimid(ncid, "south_north", &dimid[1]); + CHECK_ERR("ncmpi_inq_dimid") + err = ncmpi_inq_dimlen(ncid, dimid[1], &longitude); + CHECK_ERR("ncmpi_inq_dimlen") + err = ncmpi_inq_dimid(ncid, "west_east", &dimid[2]); + CHECK_ERR("ncmpi_inq_dimid") + err = ncmpi_inq_dimlen(ncid, dimid[2], &latitude); + CHECK_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_header_size(ncid, &hdr_size); + CHECK_ERR("ncmpi_inq_header_size") + err = ncmpi_inq_header_extent(ncid, &hdr_extent); + CHECK_ERR("ncmpi_inq_header_extent") + if (verbose && debug && rank == 0) + 
printf("Line %d: header size %lld extent %lld free space %lld\n", + __LINE__,hdr_size,hdr_extent,hdr_extent-hdr_size); + + /* check number of records in input file */ + err = ncmpi_inq_unlimdim(ncid, &unlimdimid); + CHECK_ERR("ncmpi_inq_unlimdim") + err = ncmpi_inq_dimlen(ncid, unlimdimid, &num_rec); + CHECK_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_nvars(ncid, &nvars); + CHECK_ERR("ncmpi_inq_nvars") + + fix_nvars = 0; + rec_nvars = 0; + fix_off = -1; + rec_off = -1; + for (i=0; i 0 && r_amnt[0] > 0) { + printf("Line %d: rank %d r_amnt expect 0 but got %lld\n", + __LINE__,rank,r_amnt[0]); + err = 1; + } + if (rank > 0 && w_amnt[0] > 0) { + printf("Line %d: rank %d w_amnt expect 0 but got %lld\n", + __LINE__,rank,w_amnt[0]); + err = 1; + } + MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + if (err > 0) goto err_out; + + /* start the timer */ + MPI_Barrier(MPI_COMM_WORLD); + timing = MPI_Wtime(); + + err = ncmpi__enddef(ncid, 0, 0, 0, 0); + CHECK_ERR("ncmpi_enddef") + + timing = MPI_Wtime() - timing; + + err = ncmpi_inq_get_size(ncid, &r_amnt[1]); + CHECK_ERR("ncmpi_inq_get_size") + err = ncmpi_inq_put_size(ncid, &w_amnt[1]); + CHECK_ERR("ncmpi_inq_put_size") + + err = ncmpi_inq_header_size(ncid, &hdr_size); + CHECK_ERR("ncmpi_inq_header_size") + err = ncmpi_inq_header_extent(ncid, &hdr_extent); + CHECK_ERR("ncmpi_inq_header_extent") + if (verbose && debug && rank == 0) + printf("Line %d: header size %lld extent %lld free space %lld\n", + __LINE__,hdr_size,hdr_extent,hdr_extent-hdr_size); + + /* fill the new record variable, so ncmpidiff can run and check */ + for (i=0; i 0); + return (err != NC_NOERR); } diff --git a/configure.ac b/configure.ac index 1426121047..e65563916b 100644 --- a/configure.ac +++ b/configure.ac @@ -15,7 +15,7 @@ dnl AC_REVISION([$Revision$])dnl dnl autoconf v2.70 and later is required. 
See https://github.com/Parallel-NetCDF/PnetCDF/issues/94 dnl autoconf v2.70 was released in 2021-01-28 AC_PREREQ([2.70]) -AC_INIT([PnetCDF], [1.14.1], +AC_INIT([PnetCDF], [1.15.0-alpha], [parallel-netcdf@mcs.anl.gov], [pnetcdf], [https://parallel-netcdf.github.io]) @@ -69,8 +69,8 @@ AM_EXTRA_RECURSIVE_TARGETS([tests]) dnl parse the version numbers to 4 env variables PNETCDF_VERSION_MAJOR=`echo ${PACKAGE_VERSION} | cut -d. -f1` PNETCDF_VERSION_MINOR=`echo ${PACKAGE_VERSION} | cut -d. -f2` -PNETCDF_VERSION_SUB=`echo ${PACKAGE_VERSION} | cut -d. -f3` -PNETCDF_VERSION_PRE=`echo ${PACKAGE_VERSION} | cut -d. -f4` +PNETCDF_VERSION_SUB=`echo ${PACKAGE_VERSION} | cut -d. -f3 | cut -d'-' -f1` +PNETCDF_VERSION_PRE=`echo ${PACKAGE_VERSION} | cut -d'-' -f2` dnl Note major, minor, and sub are required, but pre is not. PNETCDF_VERSION=${PACKAGE_VERSION} @@ -141,6 +141,9 @@ dnl AH_TEMPLATE([ENABLE_IN_PLACE_SWAP], [Define if to enable in-place byte swap] dnl AH_TEMPLATE([DISABLE_IN_PLACE_SWAP],[Define if to disable in-place byte swap]) AH_TEMPLATE([ENABLE_SUBFILING], [Define if to enable subfiling feature]) AH_TEMPLATE([ENABLE_NETCDF4], [Define if to enable NetCDF-4 support]) +AH_TEMPLATE([ENABLE_CHUNKING], [Define if to enable chunked storage layout and chunking feature]) +AH_TEMPLATE([ENABLE_ZLIB], [Define if to enable zlib chunking method]) +AH_TEMPLATE([ENABLE_SZ], [Define if to enable sz chunking method]) AH_TEMPLATE([ENABLE_ADIOS], [Define if to enable ADIOS BP read feature]) AH_TEMPLATE([HDF5_VER_GE_1_10_4], [Define if HDF5 version is at least 1.10.4]) AH_TEMPLATE([NETCDF_GE_4_5_0], [Define if NetCDF version is at least 4.5.0]) @@ -1175,6 +1178,8 @@ dnl AC_CHECK_FUNCS([memset setlocale sqrt strchr strrchr strtol]) dnl AC_CHECK_LIB([m], [tanh]) dnl UD_CHECK_LIB_MATH +AC_CHECK_HEADERS([unistd.h fcntl.h malloc.h stddef.h sys/types.h limits.h time.h dirent.h]) + dnl When using gcc based compiler with -ansi flag, AC_CHECK_FUNCS can still dnl find strdup, but AC_CHECK_DECL cannot. 
So we check with AC_CHECK_DECL dnl first and then check AC_CHECK_FUNCS. @@ -1377,8 +1382,11 @@ AC_CHECK_FUNCS([MPI_Type_create_subarray_c \ MPI_Type_get_true_extent_c \ MPI_Type_get_envelope_c \ MPI_Type_get_contents_c \ + MPI_Status_set_elements_x \ MPI_Bcast_c \ MPI_Get_count_c \ + MPI_Isend_c \ + MPI_Irecv_c \ MPI_Pack_c \ MPI_Unpack_c \ MPI_File_read_at_c \ @@ -1459,6 +1467,14 @@ if test "$mpi_version" -ge "3" ; then [], [], [[#include ]]) fi +# check some MPI combiner types that are used internally in PnetCDF +UD_CHECK_MPI_CONSTANTS([MPI_COMBINER_DUP, + MPI_COMBINER_SUBARRAY, + MPI_COMBINER_DARRAY, + MPI_COMBINER_INDEXED_BLOCK, + MPI_COMBINER_HINDEXED_BLOCK], + [], [], [[#include ]]) + dnl Check presence of various MPI error classes. Introduced in MPI 2.0. dnl These could be enums, so we have to do compile checks. dnl AC_CHECK_DECLS([MPI_ERR_FILE_EXISTS, @@ -1521,6 +1537,97 @@ dnl UD_CHECK_MPI_DATATYPE(MPI_REAL8) dnl first defined in MPI 1.0 dnl UD_CHECK_MPI_DATATYPE(MPI_DOUBLE_PRECISION) dnl first defined in MPI 1.0 dnl fi +AC_MSG_CHECKING([whether MPI_Waitall takes MPI_STATUSES_IGNORE]) +if test "x${GCC}" = xyes; then + saved_CFLAGS=${CFLAGS} + CFLAGS="-Werror -Wstringop-overflow=2" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]],[[ + int count; + MPI_Request *reqs; + MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE); + ]])], [MPI_STATUSES_IGNORE=yes], [MPI_STATUSES_IGNORE=no]) + CFLAGS=${saved_CFLAGS} +else + AC_CHECK_DECL([MPI_STATUSES_IGNORE], [MPI_STATUSES_IGNORE=yes], [MPI_STATUSES_IGNORE=no] [[#include ]]) +fi +AC_MSG_RESULT([$MPI_STATUSES_IGNORE]) +if test "x$MPI_STATUSES_IGNORE" = xyes ; then + AC_DEFINE(HAVE_MPI_STATUSES_IGNORE, 1, [Whether MPI_Waitall takes argument MPI_STATUSES_IGNORE]) +fi + +# +# Check for statfs (many) and specifically f_fstypename field (BSD) +# +AC_CHECK_HEADERS(sys/vfs.h sys/param.h sys/mount.h sys/statvfs.h sys/stat.h sys/type.h unistd.h) + +AC_CHECK_FUNCS([statvfs statfs stat]) + +AC_CHECK_MEMBERS([struct 
statvfs.f_basetype, + struct statfs.f_fstypename, + struct statfs.f_type, + struct stat.st_fstype],[],[], + AC_INCLUDES_DEFAULT + [#ifdef HAVE_SYS_VFS_H + #include + #endif + #ifdef HAVE_SYS_PARAM_H + #include + #endif + #ifdef HAVE_SYS_MOUNT_H + #include + #endif + #ifdef HAVE_SYS_STATFS_H + #include + #endif + #ifdef HAVE_SYS_STAT_H + #include + #endif + #ifdef HAVE_SYS_TYPE_H + #include + #endif + #ifdef HAVE_UNISTD_H + #include + #endif + ]) + +AC_CHECK_TYPE([blksize_t],[],[AC_DEFINE_UNQUOTED([blksize_t],[__blksize_t],[Provide blksize_t if not available]) ], [[ + #ifdef HAVE_SYS_TYPES_H + #include + #endif + #ifdef HAVE_SYS_STAT_H + #include + #endif + #ifdef HAVE_UNISTD_H + #include + #endif]] ) + +AC_CHECK_DECLS([pwrite]) + +# +# Check if Lustre is available by verifying presence of lustre/lustre_user.h +# +has_lustre=no +AC_CHECK_HEADERS([lustre/lustre_user.h linux/lustre/lustre_user.h], + [has_lustre=yes ; break]) +if test "x$has_lustre" = xyes ; then + AC_DEFINE(HAVE_LUSTRE, 1, [Define for LUSTRE]) + LIBS="$LIBS -llustreapi" + # llapi_get_obd_count() can get the total number of available OSTs + AC_CHECK_FUNCS([llapi_get_obd_count]) +fi +AM_CONDITIONAL(HAVE_LUSTRE, [test x$has_lustre = xyes]) + +minicking_lustre=no +if test "x$has_lustre" = xno ; then + AC_MSG_CHECKING([for whether mimicking Lustre]) + if test "x$MIMIC_LUSTRE" = xyes ; then + AC_DEFINE(MIMIC_LUSTRE, 1, [Define for mimicking LUSTRE file system]) + minicking_lustre=yes + fi + AC_MSG_RESULT($minicking_lustre) +fi +AM_CONDITIONAL(MIMIC_LUSTRE, [test x$minicking_lustre = xyes]) + AC_C_CHAR_UNSIGNED AC_C_BIGENDIAN AM_CONDITIONAL(IS_BIGENDIAN, [test x$ac_cv_c_bigendian = xyes]) @@ -1739,8 +1846,15 @@ if test "x${debug}" = xyes; then if test "x$?" != x0 ; then CFLAGS="$CFLAGS -g" fi - CFLAGS=`echo $CFLAGS | ${SED} 's/-O. 
*//g' | ${SED} 's/-fast *//g'` - CFLAGS="$CFLAGS -O0" + + # remove -fast if set by user + CFLAGS=`echo $CFLAGS | ${SED} 's/-fast *//g'` + + # check if -O is set by user, if not, then add -O0 + str_found=`echo "${CFLAGS}" | ${EGREP} -- "-O"` + if test "x$str_found" = x ; then + CFLAGS="$CFLAGS -O0" + fi if test "x${has_mpicxx}" = xyes ; then str_found=`echo "${CXXFLAGS}" | ${EGREP} -- "-g"` @@ -2395,6 +2509,135 @@ fi AC_SUBST(ENABLE_BURST_BUFFER) AM_CONDITIONAL(ENABLE_BURST_BUFFER, [test x$enable_bbdriver = xyes]) +AC_ARG_ENABLE([chunking], + [AS_HELP_STRING([--enable-chunking], + [Enable chunked chunking driver support. @<:@default: disabled@:>@])], + [enable_chunking=${enableval}], [enable_chunking=no] +) + +ENABLE_CHUNKING=0 +if test "x$enable_chunking" = "xyes" ; then + AC_DEFINE(ENABLE_CHUNKING) + ENABLE_CHUNKING=1 +fi +AC_SUBST(ENABLE_CHUNKING) +AM_CONDITIONAL(ENABLE_CHUNKING, [test x$enable_chunking = xyes]) + +AC_ARG_ENABLE([zlib], + [AS_HELP_STRING([--enable-zlib], + [Enable zlib chunking method support. 
@<:@default: disabled@:>@])], + [enable_zlib=${enableval}], [enable_zlib=no] +) + +ENABLE_ZLIB=0 +if test "x$enable_zlib" = "xyes" ; then + AC_DEFINE(ENABLE_ZLIB) + ENABLE_ZLIB=1 +fi +AC_SUBST(ENABLE_ZLIB) +AM_CONDITIONAL(ENABLE_ZLIB, [test x$enable_zlib = xyes]) + +if test "x$enable_zlib" = "xyes" ; then + ZLIB_INSTALL="" + AC_ARG_WITH(zlib, + [AS_HELP_STRING([--with-zlib=/path/to/implementation], + [installation prefix for zlib implementation])], + if test "x${withval}" = xyes; then + AC_MSG_ERROR(--with-zlib is set but the value is NULL) + else + ZLIB_INSTALL=${withval} + fi + ) + + if test "x${ZLIB_INSTALL}" != x ; then + CPPFLAGS+=" -I${ZLIB_INSTALL}/include" + LDFLAGS+=" -L${ZLIB_INSTALL}/lib" + LIBS+=" -lz" + fi + + LIBS+=" -lm -ldl" + + have_zlib=no + AC_MSG_CHECKING(ZLIB library) + AC_SEARCH_LIBS([deflate], [z], [have_zlib=yes], [have_zlib=no]) + if test "x${have_zlib}" = xyes; then + AC_CHECK_HEADERS([zlib.h], [], [have_zlib=no]) + fi + + if test "x${have_zlib}" = xno; then + AC_MSG_ERROR([ + ------------------------------------------------------------ + The ZLIB library and header file are required to build + PnetCDF with ZLIB chunking support. Use option + --with-zlib=/path/to/implementation + to specify the location of ZLIB build. + Stopping ... + Check 'config.log' for more information. + ------------------------------------------------------------]) + fi +fi + +AC_ARG_ENABLE([sz], + [AS_HELP_STRING([--enable-sz], + [Enable sz chunking method support. 
@<:@default: disabled@:>@])], + [enable_sz=${enableval}], [enable_sz=no] +) + +ENABLE_SZ=0 +if test "x$enable_sz" = "xyes" ; then + AC_DEFINE(ENABLE_SZ) + ENABLE_SZ=1 +fi +AC_SUBST(ENABLE_SZ) +AM_CONDITIONAL(ENABLE_SZ, [test x$enable_sz = xyes]) + +has_compression=0 +if test "x${have_zlib}" = xyes || test "x$enable_sz" = "xyes" ; then + has_compression=1 +fi +AC_DEFINE(ENABLE_COMPRESSION, [$has_compression], [Defined if compression is enabled]) +AM_CONDITIONAL(ENABLE_COMPRESSION, [test x$has_compression = x1]) + +if test "x$enable_sz" = "xyes" ; then + SZ_INSTALL="" + AC_ARG_WITH(sz, + [AS_HELP_STRING([--with-sz=/path/to/implementation], + [installation prefix for sz implementation])], + if test "x${withval}" = xyes; then + AC_MSG_ERROR(--with-sz is set but the value is NULL) + else + SZ_INSTALL=${withval} + fi + ) + + if test "x${SZ_INSTALL}" != x ; then + CPPFLAGS+=" -I${SZ_INSTALL}/include" + LDFLAGS+=" -L${SZ_INSTALL}/lib" + LIBS+=" -lSZ -lzstd" + fi + + LIBS+=" -lm -ldl" + + have_sz=no + AC_MSG_CHECKING(SZ library) + AC_SEARCH_LIBS([deflate], [z], [have_sz=yes], [have_sz=no]) + if test "x${have_sz}" = xyes; then + AC_CHECK_HEADERS([sz.h], [], [have_sz=no]) + fi + + if test "x${have_sz}" = xno; then + AC_MSG_ERROR([ + ------------------------------------------------------------ + The SZ library and header file are required to build + PnetCDF with SZ chunking support. Use option + --with-sz=/path/to/implementation + to specify the location of SZ build. + Stopping ... + Check 'config.log' for more information. 
+ ------------------------------------------------------------]) + fi +fi + ADIOS_INSTALL="" AC_ARG_WITH(adios, [AS_HELP_STRING([--with-adios@<:@=DIR@:>@], @@ -2603,10 +2846,10 @@ else # no name prefix end with ':' FSTYPE_PREFIX= else - # check if name prefix is one of file system types known to ROMIO - romio_known_fstypes=(ufs nfs xfs pvfs2 gpfs panfs lustre daos testfs ime quobyte) + # check if name prefix is one of file system types known to PNCIO + known_fstypes=(ufs nfs xfs pvfs2 gpfs panfs lustre daos testfs ime quobyte) known_fstype= - for pre in $romio_known_fstypes ; do + for pre in $known_fstypes ; do if test "$FSTYPE_PREFIX" = $pre ; then known_fstype=$pre break @@ -2622,6 +2865,7 @@ else fi fi AC_SUBST([FSTYPE_PREFIX]) +AC_DEFINE_UNQUOTED(TESTOUTDIR, ["$TESTOUTDIR"], [Output directory for tests]) # SEQ_CC is used to compile programs to be run sequentially, such as # pnetcdf_version, ncoffsets, and ncvalidator @@ -2691,7 +2935,7 @@ dnl Update the version information only immediately before a public release. dnl PnetCDF starts with 1:0:0 (shared library is first supported in 1.9.0) dnl because some package distributors, such as Debian, may have already built dnl PnetCDF with shared libraries. 
-ABIVERSION="7:0:0" +ABIVERSION="8:0:1" AC_SUBST(ABIVERSION) if test "$enable_versioning" = "yes" ; then ABIVERSIONFLAGS="-version-info \$(ABIVERSION)" @@ -2711,9 +2955,11 @@ AC_CONFIG_FILES(Makefile \ src/drivers/common/Makefile \ src/drivers/include/Makefile \ src/drivers/ncmpio/Makefile \ + src/drivers/pncio/Makefile \ src/drivers/nc4io/Makefile \ src/drivers/ncadios/Makefile \ src/drivers/ncbbio/Makefile \ + src/drivers/ncchunkio/Makefile \ src/drivers/ncfoo/Makefile \ src/binding/Makefile \ src/binding/cxx/Makefile \ diff --git a/doc/README.Chunk.md b/doc/README.Chunk.md new file mode 100644 index 0000000000..fea6255cff --- /dev/null +++ b/doc/README.Chunk.md @@ -0,0 +1,119 @@ +# Support variable chunking and compression + +PnetCDF contains an experimental variable chunking and compression feature +for classic NetCDF files. + +For details about its design and implementation, please refer to: +Hou, Kaiyuan, et al. "Supporting Data Compression in PnetCDF." +2021 IEEE International Conference on Big Data (Big Data). IEEE, 2021. + +## Enable variable chunking support + +* To build PnetCDF with variable chunking support + + Add `--enable-chunking` option at the configure command line. For example, + ``` + ./configure --prefix=/PnetCDF/install/path --enable-chunking + ``` +* To build deflate filter support for chunked variable + + Add `--enable-zlib` option at the configure command line. Option + `--with-zlib` can also be used to specify the installation path of + zlib if it is not in the standard locations. For example, + ``` + ./configure --prefix=/PnetCDF/install/path --enable-chunking --enable-zlib \ + --with-zlib=/zlib/install/path + ``` +* To build sz filter support for chunked variable + + Add `--enable-sz` option at the configure command line. Option + `--with-sz` can also be used to specify the installation path of + sz if it is not in the standard locations. 
For example, + ``` + ./configure --prefix=/PnetCDF/install/path --enable-chunking --enable-sz \ + --with-sz=/sz/install/path + ``` + +## Enable variable chunking + +To enable chunked storage layout for variables, set the file info "nc_chunking" +to "enable". The chunking feature requires 64-bit NetCDF format (CDF5). +For example, +``` + MPI_Info_create(&info); MPI_Info_set(info, "nc_chunking", "enable"); + ncmpi_create(MPI_COMM_WORLD, fname, NC_64BIT_DATA, info, &ncid); +``` +Alternatively, the file info can be set through the environment variable +"PNETCDF_HINTS". +``` +export PNETCDF_HINTS="nc_chunking=enable" +``` +When chunking is enabled, all non-scalar variables will be stored in a chunked +storage layout. Scalar variables are not chunked. + +Users can also set the default filter for chunked variables. For example, +``` + MPI_Info_set(info, "nc_chunk_default_filter", "zlib"); +``` +or +``` +export PNETCDF_HINTS="nc_chunking=enable;nc_chunk_default_filter=zlib" +``` +The available filter options are none (default), zlib (deflate), and sz. + +## Define chunk dimension of variables + +Applications can use the following APIs to set and get the chunk dimension of +a variable. +``` + int ncmpi_var_set_chunk (int ncid, int varid, int *chunk_dim); + int ncmpi_var_get_chunk (int ncid, int varid, int *chunk_dim); +``` +For example: +``` + int dim[2] = {100, 100}; + int chunk_dim[2] = {10, 10}; + ncmpi_def_var (ncid, name, type, 2, dim, &varid); + ncmpi_var_set_chunk (ncid, varid, chunk_dim); +``` +For record variables, the chunk dimension along the record dimension is always +1. +The default chunk dimension is the dimension of the variable except for the +record dimension. By default, PnetCDF will create one chunk per record or +variable. + +## Define filter for chunked variables + +Applications can use the following APIs to set and get the filter of +a variable.
+``` +#define NC_FILTER_NONE 0 +#define NC_FILTER_DEFLATE 2 +#define NC_FILTER_SZ 3 +int ncmpi_var_set_filter (int ncid, int varid, int filter); +int ncmpi_var_get_filter (int ncid, int varid, int *filter); +``` +For example: +``` + ncmpi_var_set_filter (ncid, varid, NC_FILTER_DEFLATE); +``` +Valid filter values are NC_FILTER_NONE (none), NC_FILTER_DEFLATE (zlib), and +NC_FILTER_SZ (sz). + + +## Known problems + +There are some limitations of the experimental variable chunking feature. + +* Only one filter can be applied to a chunked variable. Unlike HDF5 which allows + the stacking of multiple filters on chunked datasets, the current + implementation in PnetCDF only allows a single filter to be applied to a + variable. +* No per-variable option for variable chunking. If chunking is enabled, all + non-scalar variables will be chunked even if the chunk dimension is not + defined. +* Independent variable I/O is not supported. Variable read/write (get/put) + must be collective in order to maintain data consistency of filtered chunks. + Non-blocking APIs can be used to mitigate the impact of this limitation. + +Copyright (C) 2022, Northwestern University and Argonne National Laboratory + +See the COPYRIGHT notice in the top-level directory. 
+ diff --git a/examples/C/Makefile.am b/examples/C/Makefile.am index 009095a774..e940ab09f8 100644 --- a/examples/C/Makefile.am +++ b/examples/C/Makefile.am @@ -42,6 +42,10 @@ check_PROGRAMS = collective_write \ time_var \ create_from_cdl +if ENABLE_COMPRESSION + check_PROGRAMS += chunk_compress chunk_io chunk_2D +endif + if INSTALL_EXAMPLES example_execbin_PROGRAMS = $(check_PROGRAMS) example_execbindir = $(exec_prefix)/pnetcdf_examples/C @@ -62,10 +66,21 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; + +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \ $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) @@ -73,31 +88,39 @@ NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \ CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \ $(NC_FILES) $(TESTOUTDIR)/pthread.nc.* $(TESTOUTDIR)/testfile.nc -EXTRA_DIST = parallel_run.sh run_c_examples.sh cdl_header.txt +EXTRA_DIST = run_c_examples.sh cdl_header.txt \ + parallel_run.sh chunk_compress_FLDS.c ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + 
$(srcdir)/../parallel_run.sh 4 "C " || exit 1 -ptest8: $(check_PROGRAMS) +ptest3: $(check_PROGRAMS) @echo "===========================================================" - @echo " $(subdir): Parallel testing on 8 MPI processes" + @echo " $(subdir): Parallel testing on 3 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 8 || exit 1 + $(srcdir)/../parallel_run.sh 3 "C " || exit 1 -ptest3: $(check_PROGRAMS) +ptest6: $(check_PROGRAMS) @echo "===========================================================" - @echo " $(subdir): Parallel testing on 3 MPI processes" + @echo " $(subdir): Parallel testing on 6 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/../parallel_run.sh 6 "C " || exit 1 + +ptest8: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 8 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 3 || exit 1 + $(srcdir)/../parallel_run.sh 8 "C " || exit 1 -ptests: ptest3 ptest4 ptest8 -ptest2 ptest6 ptest10: +ptests: ptest4 ptest6 +ptest2 ptest10: # build check targets but not invoke tests-local: all $(check_PROGRAMS) diff --git a/examples/C/chunk_2D.c b/examples/C/chunk_2D.c new file mode 100644 index 0000000000..c199531c4c --- /dev/null +++ b/examples/C/chunk_2D.c @@ -0,0 +1,628 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * This example shows how to use the chunking and compression features of + * PnetCDF to write a 3D record variable of integer data type in parallel. It + * first defines netCDF variables, each of size NTIMES x NY x NX, where NTIMES, + * NY, and NX are predefined constant. + * + * The data partitioning pattern is a checkerboard style, along both Y and X + * dimensions. Each process writes a subarray per time record. + * + * To compile: + * mpicc -O2 chunk_2D.c -o chunk_2D \ + * -I/path/to/PnetCDF/include \ + * -I/path/to/ZLIB/include \ + * -I/path/to/SZ/include \ + * -L/path/to/PnetCDF/lib \ + * -L/path/to/ZLIB/lib \ + * -L/path/to/SZ/lib \ + * -lpnetcdf -lz -lm -ldl -lSZ -lzstd + * + * Example commands for MPI run and outputs from running ncmpidump on the + * output netCDF file produced by this example program: + * + * % mpiexec -n 4 ./chunk_2D testfile.nc + * + * % ncmpidump testfile.nc + * netcdf testfile { + * // file format: CDF-5 (big variables) + * dimensions: + * time = UNLIMITED ; // (0 currently) + * Y = 10 ; + * X = 10 ; + * _datablock_dim_0 = 131484 ; + * _datablock_dim_1 = 412 ; + * _datablock_dim_2 = 412 ; + * _datablock_dim_3 = 412 ; + * _datablock_dim_4 = 412 ; + * _datablock_dim_5 = 412 ; + * _datablock_dim_6 = 412 ; + * _datablock_dim_7 = 412 ; + * variables: + * int var_0 ; + * var_0:_ndim = 3 ; + * var_0:_dimids = 0, 1, 2 ; + * var_0:_datatype = 4 ; + * var_0:_varkind = 1 ; + * var_0:_chunkdim = 1, 5, 5 ; + * var_0:_filter = 2 ; + * var_0:_metaoffset = 8LL ; + * int var_1 ; + * var_1:_ndim = 3 ; + * var_1:_dimids = 0, 1, 2 ; + * var_1:_datatype = 4 ; + * var_1:_varkind = 1 ; + * var_1:_chunkdim = 1, 5, 5 ; + * var_1:_filter = 2 ; + * var_1:_metaoffset = 65544LL ; + * byte _datablock_0(_datablock_dim_0) ; + * _datablock_0:_varkind = 2 ; + * byte _datablock_1(_datablock_dim_1) ; + 
* _datablock_1:_varkind = 2 ; + * byte _datablock_2(_datablock_dim_2) ; + * _datablock_2:_varkind = 2 ; + * byte _datablock_3(_datablock_dim_3) ; + * _datablock_3:_varkind = 2 ; + * byte _datablock_4(_datablock_dim_4) ; + * _datablock_4:_varkind = 2 ; + * byte _datablock_5(_datablock_dim_5) ; + * _datablock_5:_varkind = 2 ; + * byte _datablock_6(_datablock_dim_6) ; + * _datablock_6:_varkind = 2 ; + * byte _datablock_7(_datablock_dim_7) ; + * _datablock_7:_varkind = 2 ; + * + * // global attributes: + * :_comressed = 1 ; + * :_nwrite = 8 ; + * :_recsize = 8LL ; + * } + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +#include +#include /* strcpy(), strncpy() */ +#include /* getopt() */ +#include +#include + +#define NTIMES 8 +#define NY 10 +#define NX 10 +#define NVARS 2 + +static int verbose; + +#define PNC_ERR(fname) { \ + if (err != NC_NOERR) { \ + printf("Error at %s:%d when calling %s (%s)\n", __FILE__,__LINE__, \ + fname, ncmpi_strerror(err)); \ + nerrs++; \ + goto err_out; \ + } \ +} + +#define MPI_ERROR(fname) { \ + if (err != MPI_SUCCESS) { \ + int errorStringLen; \ + char errorString[MPI_MAX_ERROR_STRING]; \ + MPI_Error_string(err, errorString, &errorStringLen); \ + printf("Error at %s:%d when calling %s (%s)\n", __FILE__,__LINE__, \ + fname, errorString); \ + nerrs++; \ + goto err_out; \ + } \ +} + +#define CALC_START_COUNT(len, nprocs, rank, start, count) { \ + count = len / nprocs; \ + start = count * rank; \ + if (rank < len % nprocs) { \ + start += rank; \ + count++; \ + } \ + else { \ + start += len % nprocs; \ + } \ +} + +static void +usage(char *argv0) +{ + char *help = + "Usage: %s [-h] | [-q] [-k format] [file_name]\n" + " [-h] Print help\n" + " [-q] Quiet mode (reports when fail)\n" + " [-k format] file format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5\n" + " [filename] output netCDF file name\n"; + fprintf(stderr, help, argv0); +} + +/*----< pnetcdf_check_mem_usage() 
>------------------------------------------*/ +/* check PnetCDF library internal memory usage */ +static int +pnetcdf_check_mem_usage(MPI_Comm comm) +{ + int err, nerrs=0, rank; + MPI_Offset malloc_size, sum_size; + + MPI_Comm_rank(comm, &rank); + + /* print info about PnetCDF internal malloc usage */ + err = ncmpi_inq_malloc_max_size(&malloc_size); + if (err == NC_NOERR) { + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && verbose) + printf("maximum heap memory allocated by PnetCDF internally is %lld bytes\n", + sum_size); + + /* check if there is any PnetCDF internal malloc residue */ + err = ncmpi_inq_malloc_size(&malloc_size); + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && sum_size > 0) + printf("heap memory allocated by PnetCDF internally has %lld bytes yet to be freed\n", + sum_size); + } + else if (err != NC_ENOTENABLED) { + printf("Error at %s:%d: %s\n", __FILE__,__LINE__,ncmpi_strerror(err)); + nerrs++; + } + return nerrs; +} + +/*----< compress() >--------------------------------------------------------*/ +static int +compress(MPI_Comm comm, char *filename, int cmode) +{ + char name[64]; + int i, j, rank, nprocs, err, nerrs=0, ncid, varid[NVARS]; + int dimid[3], psize[2], rank_y, rank_x; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + /* Creates a division of processors in a Cartesian grid */ + psize[0] = psize[1] = 0; + err = MPI_Dims_create(nprocs, 2, psize); + MPI_ERROR("MPI_Dims_create"); + if (verbose && rank == 0) + printf("MPI_Dims_create() 2D: psize=%d %d\n", psize[0],psize[1]); + + /* set rank along X and Y */ + rank_y = rank / psize[1]; + rank_x = rank % psize[1]; + if (verbose && rank == 0) + printf("Local rank 2D: rank_y=%d rank_x=%d\n", rank_y, rank_x); + + /* set chunking (1st dimension should always be 1 for record variable) */ + int chunk_dim[3]; + chunk_dim[0] = 1; + 
chunk_dim[1] = NY / psize[0]; + if (NY % psize[0]) chunk_dim[1]++; + chunk_dim[2] = NX / psize[1]; + if (NX % psize[1]) chunk_dim[2]++; + if (verbose && rank == 0) + printf("chunk_dim: %d %d %d\n", chunk_dim[0],chunk_dim[1],chunk_dim[2]); + + /* set subarray start and count. Each rank writes a subarray of size + * count[0] x count[1] from offset start[0], start[1], a checkerboard + * partitioning pattern. + */ + CALC_START_COUNT(NY, psize[0], rank_y, start[1], count[1]) + CALC_START_COUNT(NX, psize[1], rank_x, start[2], count[2]) + start[0] = 0; + count[0] = 1; + if (verbose) + printf("rank %d: start=%lld %lld %lld count=%lld %lld %lld\n", rank, + start[0],start[1],start[2], count[0],count[1],count[2]); + + /* allocate write buffer of size count[1] x count[2] */ + int *buf = (int*) malloc(sizeof(int) * count[1] * count[2]); + for (i=0; i-------------------------------------------------------*/ +static int +decompress(MPI_Comm comm, char *filename) +{ + char name[64]; + int i, j, rank, nprocs, err, nerrs=0, ncid, *varid, ulimit_dimid; + int nvars, dimids[3], filter, chunk_dim[3], psize[2], rank_y, rank_x; + MPI_Offset nrecs, global_ny, global_nx; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + /* Creates a division of processors in a Cartesian grid */ + psize[0] = psize[1] = 0; + err = MPI_Dims_create(nprocs, 2, psize); + MPI_ERROR("MPI_Dims_create"); + if (verbose && rank == 0) + printf("MPI_Dims_create() 2D: psize=%d %d\n", psize[0],psize[1]); + + /* set rank along X and Y */ + rank_y = rank / psize[1]; + rank_x = rank % psize[1]; + if (verbose && rank == 0) + printf("Local rank 2D: rank_y=%d rank_x=%d\n", rank_y, rank_x); + + /* open the file for reading with chunking and compression enabled */ + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + err = 
ncmpi_open(comm, filename, NC_NOWRITE, info, &ncid); + PNC_ERR("ncmpi_open") + + MPI_Info_free(&info); + + /* obtain dimension info */ + err = ncmpi_inq_dimid(ncid, "Y", &dimids[1]); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimid(ncid, "X", &dimids[2]); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimlen(ncid, dimids[1], &global_ny); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimlen(ncid, dimids[2], &global_nx); + PNC_ERR("ncmpi_inq_dimlen") + + /* obtain the number of record variables */ + err = ncmpi_inq_num_rec_vars(ncid, &nvars); + PNC_ERR("ncmpi_inq_num_rec_vars") + if (verbose && rank == 0) + printf("Number of record variables = %d\n", nvars); + + varid = (int*) malloc(sizeof(int) * nvars); + + /* obtain variable ID and dimension info */ + for (i=0; i---------------------------------------------------*/ +/* Use block-partitioning along time dimension only, i.e. each entire record of + * a variable is read by one process only. Each process may read one or more + * time records of a variable. 
+ */ +static int +partition_time(MPI_Comm comm, char *filename) +{ + char name[64]; + int i, rank, nprocs, err, nerrs=0, ncid, *varid, ulimit_dimid; + int nvars, dimids[3], filter, chunk_dim[3]; + MPI_Offset nrecs, global_ny, global_nx; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + /* open the file for reading with chunking and compression enabled */ + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + err = ncmpi_open(comm, filename, NC_NOWRITE, info, &ncid); + PNC_ERR("ncmpi_open") + + MPI_Info_free(&info); + + /* obtain dimension info */ + err = ncmpi_inq_dimid(ncid, "Y", &dimids[1]); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimid(ncid, "X", &dimids[2]); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimlen(ncid, dimids[1], &global_ny); + PNC_ERR("ncmpi_inq_dimlen") + + err = ncmpi_inq_dimlen(ncid, dimids[2], &global_nx); + PNC_ERR("ncmpi_inq_dimlen") + + /* obtain the number of record variables */ + err = ncmpi_inq_num_rec_vars(ncid, &nvars); + PNC_ERR("ncmpi_inq_num_rec_vars") + if (verbose && rank == 0) + printf("Number of record variables = %d\n", nvars); + + varid = (int*) malloc(sizeof(int) * nvars); + + /* obtain variable ID and dimension info */ + for (i=0; i +#include +#include /* strcpy(), strncpy() */ +#include /* getopt() */ +#include /* time() localtime(), asctime() */ +#include +#include + +#define NY 10 +#define NX 4 + +static int verbose; + +#define ERR {if(err!=NC_NOERR){printf("Error at %s:%d : %s\n", __FILE__,__LINE__, ncmpi_strerror(err));nerrs++;}} + +static void +usage(char *argv0) +{ + char *help = + "Usage: %s [-h] | [-q] [-k format] [file_name]\n" + " [-h] Print help\n" + " [-q] Quiet mode (reports when fail)\n" + " [-k format] file format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5\n" + " [filename] output netCDF file name\n"; + 
fprintf(stderr, help, argv0); +} + +/*----< pnetcdf_check_mem_usage() >------------------------------------------*/ +/* check PnetCDF library internal memory usage */ +static int +pnetcdf_check_mem_usage(MPI_Comm comm) +{ + int err, nerrs=0, rank; + MPI_Offset malloc_size, sum_size; + + MPI_Comm_rank(comm, &rank); + + /* print info about PnetCDF internal malloc usage */ + err = ncmpi_inq_malloc_max_size(&malloc_size); + if (err == NC_NOERR) { + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && verbose) + printf("maximum heap memory allocated by PnetCDF internally is %lld bytes\n", + sum_size); + + /* check if there is any PnetCDF internal malloc residue */ + err = ncmpi_inq_malloc_size(&malloc_size); + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && sum_size > 0) + printf("heap memory allocated by PnetCDF internally has %lld bytes yet to be freed\n", + sum_size); + } + else if (err != NC_ENOTENABLED) { + printf("Error at %s:%d: %s\n", __FILE__,__LINE__,ncmpi_strerror(err)); + nerrs++; + } + return nerrs; +} + +/*----< pnetcdf_io() >-------------------------------------------------------*/ +static int +pnetcdf_io(MPI_Comm comm, char *filename, int cmode) +{ + int i, j, rank, nprocs, err, nerrs=0; + int ncid, varid, dimid[3], buf[NY][NX]; + MPI_Offset global_ny, global_nx; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + MPI_Info_set(info, "nc_chunk_default_filter", "zlib"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + /* create a new file for writing ----------------------------------------*/ + cmode |= NC_CLOBBER; + err = ncmpi_create(comm, filename, cmode, info, &ncid); ERR + MPI_Info_free(&info); + + /* the global array is NY * (NX * nprocs) */ + global_ny = 
NY; + global_nx = NX * nprocs; + + for (i=0; i 0); +} + diff --git a/examples/C/chunk_compress_FLDS.c b/examples/C/chunk_compress_FLDS.c new file mode 100644 index 0000000000..58a5493c05 --- /dev/null +++ b/examples/C/chunk_compress_FLDS.c @@ -0,0 +1,362 @@ +/********************************************************************* + * + * Copyright (C) 2026, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + * + * This examples reads the variable FLDS from an output file generated from a + * production run of E3SM Land Model, then compress it using ZLIB and writes + * to a new file. This program is designed to demonstrate the usage of the + * chunking-compression feature of PnetCDF. Note the input file can also be a + * chunked-compressed PnetCDF file. + * + * The FLDS input file has the following metadata. + * + * // file format: CDF-5 (big variables) + * dimensions: + * x = 7814 ; + * y = 8075 ; + * time = UNLIMITED ; // (248 currently) + * variables: + * float x(x) ; + * float y(y) ; + * float time(time) ; + * float FLDS(time, y, x) ; + * FLDS:long_name = "incident longwave radiation" ; + * FLDS:units = "W/m**2" ; + * } + *********************************************************************/ + +#include +#include +#include +#include /* getopt() */ +#include +#include + +static int verbose; + +#define ERR { \ + if (err != NC_NOERR) { \ + printf("Error at %s:%d : %s\n", __FILE__,__LINE__, \ + ncmpi_strerror(err)); \ + nerrs++; \ + goto err_out; \ + } \ +} + +static void +usage(char *argv0) +{ + char *help = + "Usage: %s [-h | -q | -t | -c] [-k format] -i in_file -o out_file]n" + " [-h] Print help\n" + " [-q] Quiet mode (reports when fail)\n" + " [-k format] file format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5\n" + " [-t]: data partitioning along time dimension (default: no)\n" + " [-c]: use cyclic partitioning pattern, only relevant when -t is used (default: block)\n" + " -i filename: input netCDF file name\n" + " 
-o filename: output netCDF file name\n"; + fprintf(stderr, help, argv0); +} + +/*----< pnetcdf_io() >-------------------------------------------------------*/ +static int +pnetcdf_io(MPI_Comm comm, + const char *in_path, + const char *out_path, + int cmode, + int div_time, + int parti) +{ + int i, rank, nprocs, err, nerrs=0; + int ncid, varid, dimid[3], ntimes, chunk_dim[3]; + float *buf = NULL, *buf_ptr; + double timing[2], max_t[2]; + MPI_Offset tlen, ylen, xlen, start[3], count[3], amnt[2], sum_amnt[2]; + MPI_Info info=MPI_INFO_NULL; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + MPI_Info_set(info, "nc_chunk_default_filter", "zlib"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + /* open the input file file */ + err = ncmpi_open(comm, in_path, NC_NOWRITE, info, &ncid); ERR + err = ncmpi_inq_dimid(ncid, "time", &dimid[0]); ERR + err = ncmpi_inq_dimid(ncid, "y", &dimid[1]); ERR + err = ncmpi_inq_dimid(ncid, "x", &dimid[2]); ERR + + err = ncmpi_inq_dimlen(ncid, dimid[0], &tlen); ERR + err = ncmpi_inq_dimlen(ncid, dimid[1], &ylen); ERR + err = ncmpi_inq_dimlen(ncid, dimid[2], &xlen); ERR + + err = ncmpi_inq_varid(ncid, "FLDS", &varid); ERR + + if (div_time) { + /* partition along time dimension */ + chunk_dim[0] = 1; + chunk_dim[1] = ylen; + chunk_dim[2] = xlen; + + ntimes = tlen / nprocs; + if (rank < tlen % nprocs) + ntimes++; + + if (parti) { /* block partitioning */ + start[0] = (tlen / nprocs) * rank; + if (rank < tlen % nprocs) + start[0] += rank; + else + start[0] += tlen % nprocs; + } + else + start[0] = rank; /* cyclic partitioning */ + + start[1] = 0; + start[2] = 0; + + count[0] = 1; + count[1] = ylen; + count[2] = xlen; + } + else { + /* checkerboard partitioning on every time step */ + int psize[2], yrank, xrank; + + chunk_dim[0] = 1; + chunk_dim[1] = 1010; + chunk_dim[2] = 977; + + /* Creates a 
division of processors in a Cartesian grid */ + psize[0] = psize[1] = 0; + MPI_Dims_create(nprocs, 2, psize); + + yrank = rank / psize[1]; + xrank = rank % psize[1]; + + if (verbose) { + if (rank == 0) printf("psize %d %d\n", psize[0],psize[1]); + printf("%2d: yrank %d xrank %d\n", rank, yrank, xrank); + } + + ntimes = tlen; + + start[0] = 0; + count[0] = 1; + + count[1] = ylen / psize[0]; + start[1] = count[1] * yrank; + if (yrank < ylen % psize[0]) { + start[1] += yrank; + count[1]++; + } + else + start[1] += ylen % psize[0]; + + count[2] = xlen / psize[1]; + start[2] = count[2] * xrank; + if (xrank < xlen % psize[1]) { + start[2] += xrank; + count[2]++; + } + else + start[2] += xlen % psize[1]; + } + + if (verbose) { + printf("%2d: ntimes %d start %lld %lld %lld count %lld %lld %lld end %lld %lld\n", + rank,ntimes,start[0],start[1],start[2],count[0],count[1],count[2], + start[1]+count[1],start[2]+count[2]); + fflush(stdout); + } + + /* allocate read buffer */ + buf = (float*) malloc(sizeof(float) * ntimes*count[1]*count[2]); + + MPI_Barrier(MPI_COMM_WORLD); + timing[0] = MPI_Wtime(); + + buf_ptr = buf; + for (i=0; i 0); +} + diff --git a/examples/C/chunk_io.c b/examples/C/chunk_io.c new file mode 100644 index 0000000000..82d964b54f --- /dev/null +++ b/examples/C/chunk_io.c @@ -0,0 +1,332 @@ +/********************************************************************* + * + * Copyright (C) 2013, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + * + *********************************************************************/ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * This example shows how to use the chunking and compression features of + * PnetCDF to write a 3D record variable of integer data type in parallel. It + * first defines a netCDF variable of size + * (NTIMES * nprocs) x (NY * nprocs) x NX + * where NTIMES, NY, and NX are predefined constant. 
+ * The data partitioning pattern for write is along Y dimension. Each process + * writes a subarray of size (NY * NX) per record. + * + * To compile: + * mpicc -O2 chunk_io.c -o chunk_io -lpnetcdf + * + * Example commands for MPI run and outputs from running ncmpidump on the + * output netCDF file produced by this example program: + * + * % mpiexec -n 4 ./chunk_io testfile.nc + * + * % ncmpidump testfile.nc + * netcdf testfile { + * // file format: CDF-5 (big variables) + * dimensions: + * time = UNLIMITED ; // (0 currently) <-- Not used anymore + * Y = 8 ; + * X = 10 ; + * _datablock_dim_0 = 65721 ; + * _datablock_dim_1 = 185 ; + * _datablock_dim_2 = 185 ; + * _datablock_dim_3 = 185 ; + * _datablock_dim_4 = 185 ; + * _datablock_dim_5 = 185 ; + * _datablock_dim_6 = 185 ; + * _datablock_dim_7 = 185 ; + * variables: + * int var ; + * var:_ndim = 3 ; + * var:_dimids = 0, 1, 2 ; + * var:_datatype = 4 ; + * var:_varkind = 1 ; + * var:_chunkdim = 1, 2, 10 ; + * var:_filter = 2 ; + * var:_metaoffset = 4LL ; + * byte _datablock_0(_datablock_dim_0) ; + * _datablock_0:_varkind = 2 ; + * byte _datablock_1(_datablock_dim_1) ; + * _datablock_1:_varkind = 2 ; + * byte _datablock_2(_datablock_dim_2) ; + * _datablock_2:_varkind = 2 ; + * byte _datablock_3(_datablock_dim_3) ; + * _datablock_3:_varkind = 2 ; + * byte _datablock_4(_datablock_dim_4) ; + * _datablock_4:_varkind = 2 ; + * byte _datablock_5(_datablock_dim_5) ; + * _datablock_5:_varkind = 2 ; + * byte _datablock_6(_datablock_dim_6) ; + * _datablock_6:_varkind = 2 ; + * byte _datablock_7(_datablock_dim_7) ; + * _datablock_7:_varkind = 2 ; + * + * // global attributes: + * :_comressed = 1 ; + * :_nwrite = 8 ; + * :_recsize = 8LL ; + * } + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +#include +#include /* strcpy(), strncpy() */ +#include /* getopt() */ +#include /* time() localtime(), asctime() */ +#include +#include + +#define NTIMES 2 +#define NY 2 +#define NX 10 + +static 
int verbose; + +#define ERR {if(err!=NC_NOERR){printf("Error at %s:%d : %s\n", __FILE__,__LINE__, ncmpi_strerror(err));nerrs++;}} + +static void +usage(char *argv0) +{ + char *help = + "Usage: %s [-h] | [-q] [-k format] [file_name]\n" + " [-h] Print help\n" + " [-q] Quiet mode (reports when fail)\n" + " [-k format] file format: 1 for CDF-1, 2 for CDF-2, 5 for CDF-5\n" + " [filename] output netCDF file name\n"; + fprintf(stderr, help, argv0); +} + +/*----< pnetcdf_check_mem_usage() >------------------------------------------*/ +/* check PnetCDF library internal memory usage */ +static int +pnetcdf_check_mem_usage(MPI_Comm comm) +{ + int err, nerrs=0, rank; + MPI_Offset malloc_size, sum_size; + + MPI_Comm_rank(comm, &rank); + + /* print info about PnetCDF internal malloc usage */ + err = ncmpi_inq_malloc_max_size(&malloc_size); + if (err == NC_NOERR) { + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && verbose) + printf("maximum heap memory allocated by PnetCDF internally is %lld bytes\n", + sum_size); + + /* check if there is any PnetCDF internal malloc residue */ + err = ncmpi_inq_malloc_size(&malloc_size); + MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); + if (rank == 0 && sum_size > 0) + printf("heap memory allocated by PnetCDF internally has %lld bytes yet to be freed\n", + sum_size); + } + else if (err != NC_ENOTENABLED) { + printf("Error at %s:%d: %s\n", __FILE__,__LINE__,ncmpi_strerror(err)); + nerrs++; + } + return nerrs; +} + +/*----< compress() >--------------------------------------------------------*/ +static int +compress(MPI_Comm comm, char *filename, int cmode) +{ + int i, j, rank, nprocs, err, nerrs=0; + int ncid, varid, dimid[3]; + MPI_Offset global_ny, global_nx; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + 
MPI_Info_set(info, "nc_chunk_default_filter", "zlib"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + /* the global array is (NTIMES * nprocs) x (NY * nprocs) x NX */ + + /* set chunking (1st dimension should always be 1 for record variable) */ + + int *buf = (int*) malloc(sizeof(int) * NY * NX); + for (i=0; i-------------------------------------------------------*/ +static int +decompress(MPI_Comm comm, char *filename) +{ + int i, j, rank, nprocs, err, nerrs=0; + int ncid, varid, dimid, filter, chunk_dim[3]; + MPI_Offset global_ny, global_nx; + MPI_Offset start[3], count[3]; + MPI_Info info; + + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + + MPI_Info_create(&info); + MPI_Info_set(info, "nc_chunking", "enable"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + /* the global array is (NTIMES * nprocs) x (NY * nprocs) * NX */ + + /* open the file for reading ----------------------------------------*/ + err = ncmpi_open(comm, filename, NC_NOWRITE, info, &ncid); ERR + + err = ncmpi_inq_varid(ncid, "var", &varid); ERR + + /* check the current record dimension size */ + MPI_Offset dim_len; + err = ncmpi_inq_unlimdim(ncid, &dimid); ERR + err = ncmpi_inq_dimlen(ncid, dimid, &dim_len); ERR + if (verbose && rank == 0) + printf("Time dimension length = %lld\n", dim_len); + + /* get chunking */ + err = ncmpi_var_get_chunk(ncid, varid, chunk_dim);; ERR + if (verbose && rank == 0) + printf("chunk_dim[3]=%d %d %d\n", + chunk_dim[0],chunk_dim[1],chunk_dim[2]); + + /* get filter */ + err = ncmpi_var_get_filter(ncid, varid, &filter); ERR + if (verbose && rank == 0) + printf("filter is %s\n", (filter == NC_FILTER_DEFLATE) ? + "NC_FILTER_DEFLATE": (filter == NC_FILTER_SZ) ? + "NC_FILTER_SZ" : "UNKNOWN"); + + /* set subarray start and count. Each rank read a whole record at a time + * for NTIMES times. Each process reads different records. 
+ */ + start[0] = rank; start[1] = 0; start[2] = 0; + count[0] = 1; count[1] = NY*nprocs; count[2] = NX; + + if (verbose) + printf("%d: start=%lld %lld %lld count=%lld %lld %lld\n", rank, + start[0], start[1], start[2], count[0], count[1], count[2]); + + int *buf; + buf = (int*) malloc(sizeof(int) * count[0] * count[1] * count[2]); + for (j=0; j 0); +} + diff --git a/examples/C/create_from_cdl.c b/examples/C/create_from_cdl.c index a2a2ae1797..2a238153dc 100644 --- a/examples/C/create_from_cdl.c +++ b/examples/C/create_from_cdl.c @@ -168,13 +168,16 @@ int main(int argc, char **argv) err = ncmpi_def_var(ncid, name, xtype, ndims, dimids, &varid); CHECK_ERR + /* fill with default fill value */ + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); + CHECK_ERR + /* retrieve metadata of attribute j associated with variable i */ err = cdl_hdr_inq_nattrs(hid, i, &nattrs); CHECK_ERR for (j=0; j 0); } diff --git a/examples/burst_buffer/Makefile.am b/examples/burst_buffer/Makefile.am index 162770fcfe..510aa5df63 100644 --- a/examples/burst_buffer/Makefile.am +++ b/examples/burst_buffer/Makefile.am @@ -27,40 +27,57 @@ DATA_FILES = $(NC_FILES:%=%_*.data) CLEANFILES = $(NC_FILES) core core.* *.gcda *.gcno *.gcov gmon.out \ $(META_FILES) $(DATA_FILES) -EXTRA_DIST = parallel_run.sh - TESTS_ENVIRONMENT = export SED="$(SED)"; TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE 
+ TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 "BB " || exit 1 -ptest8: $(check_PROGRAMS) +ptest3: $(check_PROGRAMS) @echo "===========================================================" - @echo " $(subdir): Parallel testing on 8 MPI processes" + @echo " $(subdir): Parallel testing on 3 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 8 || exit 1 + $(srcdir)/../parallel_run.sh 3 "BB "|| exit 1 -ptest3: $(check_PROGRAMS) +ptest6: $(check_PROGRAMS) @echo "===========================================================" - @echo " $(subdir): Parallel testing on 3 MPI processes" + @echo " $(subdir): Parallel testing on 6 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/../parallel_run.sh 6 "BB "|| exit 1 + +ptest8: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 8 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 3 || exit 1 + $(srcdir)/../parallel_run.sh 8 "BB "|| exit 1 -ptests: ptest3 ptest4 ptest8 -ptest2 ptest6 ptest10: +ptests: ptest4 ptest6 +ptest2 ptest10: # build check targets but not invoke tests-local: all $(check_PROGRAMS) diff --git a/examples/burst_buffer/parallel_run.sh b/examples/burst_buffer/parallel_run.sh index 9fe960c9cf..fbbc869661 100755 --- a/examples/burst_buffer/parallel_run.sh +++ b/examples/burst_buffer/parallel_run.sh @@ -21,7 +21,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` NTHREADS=`expr $1 
\* 6 - 1` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -33,17 +33,45 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do # echo "---- exec=$i" for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" # echo "${MPIRUN} ./$i -q -b ${TESTOUTDIR} ${TESTOUTDIR}/$i.nc" ${MPIRUN} ./$i -q -b ${TESTOUTDIR} ${TESTOUTDIR}/$i.nc @@ -56,6 +84,7 @@ for i in ${check_PROGRAMS} ; do # echo "" done done + done # delete output files rm -f ${OUTDIR}/$i.nc done diff --git a/examples/parallel_run.sh b/examples/parallel_run.sh new file mode 100755 index 0000000000..c8388576a4 --- /dev/null +++ b/examples/parallel_run.sh @@ -0,0 +1,230 @@ +#!/bin/bash +# +# Copyright (C) 2018, 
Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. +# + +# Exit immediately if a command exits with a non-zero status. +# set -e + +DRY_RUN=no +VERBOSE=no + +exe_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + + cmd=`basename $1` + if test "x$MIMIC_LUSTRE" = x1 && test "x$cmd" = xncmpidiff ; then + # echo "export MIMIC_STRIPE_SIZE=1048576" + export MIMIC_STRIPE_SIZE=1048576 + fi + + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi + if test $? != 0 ; then + echo "FAIL: nprocs=$1 ---- $i $TEST_OPTS" + exit 1 + fi + + if test "x$MIMIC_LUSTRE" = x1 && test "x$cmd" = xncmpidiff ; then + # echo "unset MIMIC_STRIPE_SIZE" + unset MIMIC_STRIPE_SIZE + fi +} + +seq_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +VALIDATOR=../../src/utils/ncvalidator/ncvalidator +NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff + +MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` +# echo "MPIRUN = ${MPIRUN}" +# echo "check_PROGRAMS=${check_PROGRAMS}" + +# remove file system type prefix if there is any +OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` + +# let NTHREADS=$1*6-1 +NTHREADS=`expr $1 \* 6 - 1` + +# echo "${LINENO}: PNETCDF_DEBUG = ${PNETCDF_DEBUG}" +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +# prevent user environment setting of PNETCDF_HINTS to interfere +unset PNETCDF_HINTS + +for i in ${check_PROGRAMS} ; do + # Capture start time in seconds and nanoseconds + start_time=$(date +%s.%1N) + + for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + safe_hint=" SAFE" + else + safe_hint="NOSAFE" + fi + OUT_PREFIX="${TESTOUTDIR}/$i" + + for mpiio_mode in 0 1 
; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + DRIVER_OUT_FILE="${OUT_PREFIX}.mpio" + driver_hint=" MPIO" + else + USEMPIO_HINTS="nc_pncio=enable" + DRIVER_OUT_FILE="${OUT_PREFIX}.pncio" + driver_hint="PNCIO" + fi + + for intra_aggr in 0 1 ; do + if test "$intra_aggr" = 1 ; then + INA_HINTS="nc_num_aggrs_per_node=2" + INA_OUT_FILE="${DRIVER_OUT_FILE}.ina" + ina_hint=" INA" + else + INA_HINTS="nc_num_aggrs_per_node=0" + INA_OUT_FILE="${DRIVER_OUT_FILE}" + ina_hint="NOINA" + fi + + OUT_FILE=$INA_OUT_FILE + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS;$PNETCDF_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + fi + + TEST_OPTS="$safe_hint $driver_hint $ina_hint" + + CMD_OPT=-q + IN_FILE= + if test "$i" = create_from_cdl ; then + IN_FILE=${srcdir}/cdl_header.txt + fi + + if test "$i" = pthread ; then + # each MPI process created 6 threads + exe_cmd ./$i $CMD_OPT ${OUT_FILE}.nc + for k in `seq 0 ${NTHREADS}` ; do + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.nc.$k + done + continue + elif test "$i" = put_vara ; then + exe_cmd ./$i $CMD_OPT ${OUT_FILE}.nc + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.nc + + exe_cmd ./get_vara $CMD_OPT ${OUT_FILE}.nc + elif test "$i" = get_vara ; then + continue + elif test "$i" = create_from_cdl ; then + # create_from_cdl reads a CDL header file + exe_cmd ./$i $CMD_OPT 
-o ${OUT_FILE}.nc $IN_FILE + else + exe_cmd ./$i $CMD_OPT ${OUT_FILE}.nc + fi + + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.nc + + if test "x${ENABLE_BURST_BUFFER}" = x1 ; then + saved_PNETCDF_HINTS=${PNETCDF_HINTS} + export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + if test "$i" = create_from_cdl ; then + # create_from_cdl reads a CDL header file + exe_cmd ./$i -q -o ${OUT_FILE}.bb.nc $IN_FILE + else + exe_cmd ./$i $CMD_OPT ${OUT_FILE}.bb.nc + fi + export PNETCDF_HINTS=${saved_PNETCDF_HINTS} + + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.bb.nc + + # compare file header only for large file tests + DIFF_OPT="-q" + if test "$i" = create_from_cdl ; then + DIFF_OPT+=" -h" + fi + exe_cmd ${NCMPIDIFF} $DIFF_OPT $OUT_FILE.nc $OUT_FILE.bb.nc + fi + + if test "x${ENABLE_NETCDF4}" = x1 ; then + exe_cmd ./$i ${OUT_FILE}.nc4 4 + # Validator does not support nc4 + fi + done # intra_aggr + done # mpiio_mode + + if [[ "$i" == *"vard"* ]] ; then + continue + fi + + if test "$i" = get_vara ; then + continue + fi + + DIFF_OPT="-q" + if test "$i" = create_from_cdl ; then + DIFF_OPT+=" -h" + fi + if test "$i" = pthread ; then + for j in `seq 0 ${NTHREADS}` ; do + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.mpio.ina.nc.$j + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.nc.$j + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.pncio.nc.$j $OUT_PREFIX.pncio.ina.nc.$j + done + else + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.pncio.nc $OUT_PREFIX.pncio.ina.nc + fi + + done # safe_modes + rm -f ${OUTDIR}/$i*nc* + + end_time=$(date +%s.%1N) + + # Calculate difference (requires bc for floating point math) + elapsed_time=$(echo "$end_time - $start_time" | bc) + + fixed_length=48 + printf "*** TESTING %-${fixed_length}s -- pass (%4ss)\n" "$i" 
"$elapsed_time" + +done # check_PROGRAMS + diff --git a/examples/tutorial/Makefile.am b/examples/tutorial/Makefile.am index 20859a3ff3..c7c89b4229 100644 --- a/examples/tutorial/Makefile.am +++ b/examples/tutorial/Makefile.am @@ -67,10 +67,20 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \ $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) diff --git a/examples/tutorial/parallel_run.sh b/examples/tutorial/parallel_run.sh index bfd3e65a9e..0c3261c686 100755 --- a/examples/tutorial/parallel_run.sh +++ b/examples/tutorial/parallel_run.sh @@ -5,7 +5,24 @@ # # Exit immediately if a command exits with a non-zero status. -set -e +# set -e + +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi + if test $? 
!= 0 ; then + echo "FAIL: nprocs=$1 ---- $i $TEST_OPTS" + exit 1 + fi +} VALIDATOR=../../src/utils/ncvalidator/ncvalidator NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff @@ -21,7 +38,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` NTHREADS=`expr $1 \* 6 - 1` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -31,40 +48,65 @@ fi unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do + # Capture start time in seconds and nanoseconds + start_time=$(date +%s.%1N) + for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" if test $i = "pnetcdf-read-from-master" ; then - ${MPIRUN} ./$i 
${TESTOUTDIR}/pnetcdf-write-from-master.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-from-master.nc elif test $i = "pnetcdf-read-nfiles" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-nfiles.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-nfiles.nc elif test $i = "pnetcdf-read-standard" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-standard.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-standard.nc elif test $i = "pnetcdf-read-flexible" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-flexible.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-flexible.nc elif test $i = "pnetcdf-read-nb" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-nb.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-nb.nc else - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - fi - if test $? = 0 ; then - if test $i = "pnetcdf-write-bufferedf77" ; then - echo "PASS: F77 parallel run on $1 processes --------------- $i" - elif test $i = "pnetcdf-write-bufferedf" ; then - echo "PASS: F90 parallel run on $1 processes --------------- $i" - else - echo "PASS: C parallel run on $1 processes --------------- $i" - fi + run_cmd ./$i ${TESTOUTDIR}/$i.nc fi if test "$i" = pthread ; then @@ -95,26 +137,17 @@ for i in ${check_PROGRAMS} ; do saved_PNETCDF_HINTS=${PNETCDF_HINTS} export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" if test $i = "pnetcdf-read-from-master" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-from-master.bb.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-from-master.bb.nc elif test $i = "pnetcdf-read-nfiles" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-nfiles.bb.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-nfiles.bb.nc elif test $i = "pnetcdf-read-standard" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-standard.bb.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-standard.bb.nc elif test $i = "pnetcdf-read-flexible" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-flexible.bb.nc + run_cmd ./$i 
${TESTOUTDIR}/pnetcdf-write-flexible.bb.nc elif test $i = "pnetcdf-read-nb" ; then - ${MPIRUN} ./$i ${TESTOUTDIR}/pnetcdf-write-nb.bb.nc + run_cmd ./$i ${TESTOUTDIR}/pnetcdf-write-nb.bb.nc else - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc - fi - if test $? = 0 ; then - if test $i = "pnetcdf-write-bufferedf77" ; then - echo "PASS: F77 parallel run on $1 processes --------------- $i" - elif test $i = "pnetcdf-write-bufferedf" ; then - echo "PASS: F90 parallel run on $1 processes --------------- $i" - else - echo "PASS: C parallel run on $1 processes --------------- $i" - fi + run_cmd ./$i ${TESTOUTDIR}/$i.bb.nc fi export PNETCDF_HINTS=${saved_PNETCDF_HINTS} @@ -131,18 +164,26 @@ for i in ${check_PROGRAMS} ; do # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc + run_cmd ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc fi fi if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc4 4 + run_cmd ./$i ${TESTOUTDIR}/$i.nc4 4 # Validator does not support nc4 fi done done + done + + end_time=$(date +%s.%1N) + + # Calculate difference (requires bc for floating point math) + elapsed_time=$(echo "$end_time - $start_time" | bc) + + fixed_length=48 + printf "*** TESTING %-${fixed_length}s -- pass (%4ss)\n" "$i" "$elapsed_time" + done rm -f ${OUTDIR}/pnetcdf-*.nc diff --git a/m4/foreach_idx.m4 b/m4/foreach_idx.m4 new file mode 100644 index 0000000000..fccde380cd --- /dev/null +++ b/m4/foreach_idx.m4 @@ -0,0 +1,7 @@ +divert(`-1') +# foreach_idx(x, idx, (item_1, item_2, ..., item_n), stmt) +# parenthesized list, simple version +define(`foreach_idx', `pushdef(`$1')pushdef(`$2')_foreach_idx($@,0)popdef(`$2')popdef(`$1')') +define(`_arg1', `$1') +define(`_foreach_idx', `ifelse(`$3', `()', `',`define(`$1', _arg1$3)define(`$2', `$5')$4`'$0(`$1', `$2', (shift$3), 
`$4',incr($5))')') +divert`'dnl \ No newline at end of file diff --git a/m4/libtool.m4 b/m4/libtool.m4 index 707b20f3e4..8d323b3ee4 100644 --- a/m4/libtool.m4 +++ b/m4/libtool.m4 @@ -115,60 +115,10 @@ func_cc_basename () compile | *[[\\/]]compile | ccache | *[[\\/]]ccache ) ;; distcc | *[[\\/]]distcc | purify | *[[\\/]]purify ) ;; \-*) ;; - mpicc | mpicxx | mpif77 | mpif90 | mpifort | *[[\\/]]mpicc | *[[\\/]]mpicxx | *[[\\/]]mpif77 | *[[\\/]]mpif90 | *[[\\/]]mpifort ) - # MPICH compilers - # eval "$cc_temp -show" < /dev/null >& conftest.ver - # func_cc_basename_result=`head -n1 conftest.ver |cut -d' ' -f1` - # ${RM} -f conftest.ver - func_cc_basename_result=`$cc_temp -show | cut -d' ' -f1 | xargs basename` - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - ;; - mpifccpx | mpiFCCpx | mpifrtpx | *[[\\/]]mpifccpx | *[[\\/]]mpiFCCpx | *[[\\/]]mpifrtpx ) - # MPI compilers based on Fujitsu compilers: fccpx, FCCpx, frtpx - func_cc_basename_result=`$cc_temp -showme | cut -d' ' -f1 | xargs basename` - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - ;; - cc | CC | ftn | *[[\\/]]cc | *[[\\/]]CC | *[[\\/]]ftn ) - # For Cray PrgEnv-intel, cc is a wrapper of icc - # For Cray PrgEnv-gnu, cc is a wrapper of gcc - # func_cc_basename_result=`$cc_temp --version |& head -n 1 | cut -d' ' -f1 | xargs basename` - eval "$cc_temp --version" < /dev/null >& conftest.ver - func_cc_basename_result=`head -n1 conftest.ver |cut -d' ' -f1` - ${RM} -f conftest.ver - if test "x${func_cc_basename_result}" = xicc || - test "x${func_cc_basename_result}" = xicpc || - test "x${func_cc_basename_result}" = xifort || - test "x${func_cc_basename_result}" = xgcc || - test "x${func_cc_basename_result}" = xg++ || - test "x${func_cc_basename_result}" = xgfortran || - test "x${func_cc_basename_result}" = xGNU ; then - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - fi - # For Cray PrgEnv-cray, cc is a 
wrapper of Cray CC - # Cray cc -V sends the output to stderr. - # func_cc_basename_result=`$cc_temp -V |& head -n 1 | cut -d' ' -f1 | xargs basename` - eval "$cc_temp -V" < /dev/null >& conftest.ver - func_cc_basename_result=`head -n1 conftest.ver |cut -d' ' -f1` - ${RM} -f conftest.ver - if test "x${func_cc_basename_result}" = xCray ; then - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - fi - return - ;; - mpixlc | mpixlcxx | mpixlf77 | mpixlf90 | *[[\\/]]mpixlc | *[[\\/]]mpixlcxx | *[[\\/]]mpixlf77 | *[[\\/]]mpixlf90 ) - func_cc_basename_result=`$cc_temp -show | cut -d' ' -f1 | xargs basename` - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" - return - ;; *) break;; esac done func_cc_basename_result=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"` - # echo "cc_temp=$cc_temp func_cc_basename_result=$func_cc_basename_result" } ])# _LT_PREPARE_CC_BASENAME @@ -775,7 +725,7 @@ _LT_CONFIG_SAVE_COMMANDS([ cfgfile=${ofile}T trap "$RM \"$cfgfile\"; exit 1" 1 2 15 - $RM -f "$cfgfile" + $RM "$cfgfile" cat <<_LT_EOF >> "$cfgfile" #! $SHELL @@ -1007,7 +957,7 @@ ac_outfile=conftest.$ac_objext echo "$lt_simple_compile_test_code" >conftest.$ac_ext eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err _lt_compiler_boilerplate=`cat conftest.err` -$RM -f conftest* +$RM conftest* ])# _LT_COMPILER_BOILERPLATE @@ -1062,7 +1012,7 @@ m4_defun_once([_LT_REQUIRED_DARWIN_CHECKS],[ _lt_result=$? # If there is a non-empty error log, and "single_module" # appears in it, assume the flag caused a linker warning - if test -s conftest.err && $GREP single_module conftest.err > /dev/null ; then + if test -s conftest.err && $GREP single_module conftest.err; then cat conftest.err >&AS_MESSAGE_LOG_FD # Otherwise, if the output was created with a 0 exit code from # the compiler, it worked. 
@@ -1200,10 +1150,7 @@ m4_defun([_LT_DARWIN_LINKER_FEATURES], _LT_TAGVAR(link_all_deplibs, $1)=yes _LT_TAGVAR(allow_undefined_flag, $1)=$_lt_dar_allow_undefined case $cc_basename in - ifort*|nagfor*) - _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' - _lt_dar_can_shared=yes - ;; + ifort*|nagfor*) _lt_dar_can_shared=yes ;; *) _lt_dar_can_shared=$GCC ;; esac if test yes = "$_lt_dar_can_shared"; then @@ -1214,8 +1161,8 @@ m4_defun([_LT_DARWIN_LINKER_FEATURES], _LT_TAGVAR(module_expsym_cmds, $1)="$SED -e 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags$_lt_dar_export_syms$_lt_dsymutil" m4_if([$1], [CXX], [ if test yes = "$_lt_dar_needs_single_mod" -a yes != "$lt_cv_apple_cc_single_mod"; then - _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs $nostdlib_flag -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dsymutil" - _LT_TAGVAR(archive_expsym_cmds, $1)="$SED 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC -r -keep_private_externs $nostdlib_flag -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dar_export_syms$_lt_dsymutil" + _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring$_lt_dsymutil" + _LT_TAGVAR(archive_expsym_cmds, $1)="$SED 's|^|_|' < \$export_symbols > \$output_objdir/\$libname-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \$lib-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$lib-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname 
\$verstring$_lt_dar_export_syms$_lt_dsymutil" fi ],[]) else @@ -2190,7 +2137,7 @@ AC_CACHE_CHECK([if $compiler supports -c -o file.$ac_objext], $RM out/* && rmdir out cd .. $RM -r conftest - $RM -f conftest* + $RM conftest* ]) _LT_TAGDECL([compiler_c_o], [lt_cv_prog_compiler_c_o], [1], [Does compiler simultaneously support -c and -o options?]) @@ -2400,10 +2347,6 @@ if test yes = "$GCC"; then *) lt_sed_strip_eq='s|=/|/|g' ;; esac lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq` - case $cc_basename in - fccpx* | FCCpx* ) lt_search_path_spec=`$CC --showme:libdirs` ;; - *) lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq` ;; - esac case $lt_search_path_spec in *\;*) # if the path contains ";" then we assume it to be the separator @@ -3437,19 +3380,8 @@ AC_ARG_WITH([gnu-ld], [test no = "$withval" || with_gnu_ld=yes], [with_gnu_ld=no])dnl -_LT_CC_BASENAME($CC) - ac_prog=ld -# special care for Fujitsu C or C++ compilers -if test "$cc_basename" = fccpx || test "$cc_basename" = FCCpx ; then - if test yes = "$with_gnu_ld" || test "$host_os" = linux-gnu ; then - ac_prog=`($CC -Xg -print-prog-name=ld) 2>&5` - test -z "$LD" && LD=$ac_prog - with_gnu_ld=yes - fi -fi - -if test "$ac_prog" = ld && test yes = "$GCC" ; then +if test yes = "$GCC"; then # Check if gcc -print-prog-name=ld gives a path. 
AC_MSG_CHECKING([for ld used by $CC]) case $host in @@ -3569,7 +3501,7 @@ case $host_os in ;; darwin*) if test yes = "$GCC"; then - reload_cmds='$LTCC $LTCFLAGS $nostdlib_flag -nostdlib $wl-r -o $output$reload_objs' + reload_cmds='$LTCC $LTCFLAGS -nostdlib $wl-r -o $output$reload_objs' else reload_cmds='$LD$reload_flag -o $output$reload_objs' fi @@ -4406,12 +4338,7 @@ m4_if([$1], [CXX], [ # AIX 5 now supports IA64 processor _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' fi - if test "$cc_basename" = FCCpx ; then # Fujitsu C++ - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Xg -KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' - else - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' - fi + _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC' ;; amigaos*) @@ -4609,12 +4536,6 @@ m4_if([$1], [CXX], [ _LT_TAGVAR(lt_prog_compiler_pic, $1)='-qpic' _LT_TAGVAR(lt_prog_compiler_static, $1)='-qstaticlink' ;; - FCCpx* ) - # Fujitsu C++ - _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Xg -KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' - ;; *) case `$CC -V 2>&1 | $SED 5q` in *Sun\ C*) @@ -4842,10 +4763,6 @@ m4_if([$1], [CXX], [ _LT_TAGVAR(lt_prog_compiler_pic, $1)="-Xcompiler $_LT_TAGVAR(lt_prog_compiler_pic, $1)" fi ;; - fccpx*) # Fujitsu C Compiler - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Xg -KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' - ;; esac else # PORTME Check for flag to pass linker flags through the system compiler. 
@@ -4935,18 +4852,6 @@ m4_if([$1], [CXX], [ _LT_TAGVAR(lt_prog_compiler_pic, $1)='--shared' _LT_TAGVAR(lt_prog_compiler_static, $1)='--static' ;; - frtpx* ) - # Fujitsu Fortran compiler - _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Kstatic_fjlib' - ;; - fccpx* | FCCpx* ) - # Fujitsu C or C++ compiler - _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,' - _LT_TAGVAR(lt_prog_compiler_pic, $1)='-Xg -KPIC' - _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic' - ;; nagfor*) # NAG Fortran compiler _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,-Wl,,' @@ -5454,8 +5359,6 @@ _LT_EOF _LT_TAGVAR(whole_archive_flag_spec, $1)= tmp_sharedflag='--shared' ;; nagfor*) # NAGFOR 5.3 - _LT_TAGVAR(whole_archive_flag_spec, $1)='$wl--whole-archive`for conv in $convenience\"\"; do test -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` $wl--no-whole-archive' - _LT_TAGVAR(compiler_needs_object, $1)=yes tmp_sharedflag='-Wl,-shared' ;; xl[[cC]]* | bgxl[[cC]]* | mpixl[[cC]]*) # IBM XL C 8.0 on PPC (deal with xlf below) tmp_sharedflag='-qmkshrobj' @@ -6344,7 +6247,7 @@ x|xyes) # to ld, don't add -lc before -lgcc. AC_CACHE_CHECK([whether -lc should be explicitly linked in], [lt_cv_]_LT_TAGVAR(archive_cmds_need_lc, $1), - [$RM -f conftest* + [$RM conftest* echo "$lt_simple_compile_test_code" > conftest.$ac_ext if AC_TRY_EVAL(ac_compile) 2>conftest.err; then @@ -6371,7 +6274,7 @@ x|xyes) else cat conftest.err 1>&5 fi - $RM -f conftest* + $RM conftest* ]) _LT_TAGVAR(archive_cmds_need_lc, $1)=$lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1) ;; @@ -6653,8 +6556,8 @@ if test yes != "$_lt_caught_CXX_error"; then # Check if GNU C++ uses GNU ld as the underlying linker, since the # archiving commands below assume that GNU ld is being used. 
if test yes = "$with_gnu_ld"; then - _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' - _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname -o $lib' + _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname $wl-retain-symbols-file $wl$export_symbols -o $lib' _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='$wl-rpath $wl$libdir' _LT_TAGVAR(export_dynamic_flag_spec, $1)='$wl--export-dynamic' @@ -6665,8 +6568,6 @@ if test yes != "$_lt_caught_CXX_error"; then wlarc='$wl' # ancient GNU ld didn't support --whole-archive et. al. - # TODO: when using FCCpx, need to run command `$CC -Xg -print-prog-name=ld` - # to get the linker, LD. if $LD --help 2>&1 | $GREP 'no-whole-archive' > /dev/null; then _LT_TAGVAR(whole_archive_flag_spec, $1)=$wlarc'--whole-archive$convenience '$wlarc'--no-whole-archive' else @@ -6681,7 +6582,7 @@ if test yes != "$_lt_caught_CXX_error"; then # linker, instead of GNU ld. If possible, this setting should # overridden to take advantage of the native linker features on # the platform it is being used on. 
- _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib' fi # Commands to make compiler produce verbose output that lists @@ -6952,7 +6853,7 @@ if test yes != "$_lt_caught_CXX_error"; then _LT_TAGVAR(file_list_spec, $1)='@' if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' # If the export-symbols file already is a .def file, use it as # is; otherwise, prepend EXPORTS... 
_LT_TAGVAR(archive_expsym_cmds, $1)='if _LT_DLL_DEF_P([$export_symbols]); then @@ -6961,7 +6862,7 @@ if test yes != "$_lt_caught_CXX_error"; then echo EXPORTS > $output_objdir/$soname.def; cat $export_symbols >> $output_objdir/$soname.def; fi~ - $CC -shared $nostdlib_flag -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' + $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname $wl--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib' else _LT_TAGVAR(ld_shlibs, $1)=no fi @@ -7068,7 +6969,7 @@ if test yes != "$_lt_caught_CXX_error"; then ;; *) if test yes = "$GXX"; then - _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared $nostdlib_flag -nostdlib $pic_flag $wl+b $wl$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib' + _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag $wl+b $wl$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test "x$output_objdir/$soname" = "x$lib" || mv $output_objdir/$soname $lib' else # FIXME: insert proper C++ library support _LT_TAGVAR(ld_shlibs, $1)=no @@ -7136,13 +7037,13 @@ if test yes != "$_lt_caught_CXX_error"; then if test no = "$with_gnu_ld"; then case $host_cpu in hppa*64*) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib -fPIC $wl+h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib -fPIC $wl+h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; ia64*) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag 
-nostdlib $pic_flag $wl+h $wl$soname $wl+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag $wl+h $wl$soname $wl+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; *) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag $wl+h $wl$soname $wl+b $wl$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags' ;; esac fi @@ -7183,9 +7084,9 @@ if test yes != "$_lt_caught_CXX_error"; then *) if test yes = "$GXX"; then if test no = "$with_gnu_ld"; then - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' else - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` -o $lib' fi fi _LT_TAGVAR(link_all_deplibs, $1)=yes @@ -7459,10 +7360,10 @@ if test yes != 
"$_lt_caught_CXX_error"; then _LT_TAGVAR(allow_undefined_flag, $1)=' $wl-expect_unresolved $wl\*' case $host in osf3*) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $nostdlib_flag -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' ;; *) - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $nostdlib_flag -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-msym $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-msym $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations -o $lib' ;; esac @@ -7552,9 +7453,9 @@ if test yes != "$_lt_caught_CXX_error"; then if test yes,no = "$GXX,$with_gnu_ld"; then _LT_TAGVAR(no_undefined_flag, $1)=' $wl-z ${wl}defs' if $CC --version | $GREP -v '^2\.7' > /dev/null; then - _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > 
$lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ - $CC -shared $pic_flag $nostdlib_flag -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' + $CC -shared $pic_flag -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when @@ -7563,9 +7464,9 @@ if test yes != "$_lt_caught_CXX_error"; then else # g++ 2.7 appears to require '-G' NOT '-shared' on this # platform. - _LT_TAGVAR(archive_cmds, $1)='$CC -G $nostdlib_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' + _LT_TAGVAR(archive_cmds, $1)='$CC -G -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags $wl-h $wl$soname -o $lib' _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~ - $CC -G $nostdlib_flag -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' + $CC -G -nostdlib $wl-M $wl$lib.exp $wl-h $wl$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp' # Commands to make compiler produce verbose output that lists # what "hidden" libraries, object files and flags are used when @@ -7843,7 +7744,7 @@ if AC_TRY_EVAL(ac_compile); then # The "-l" case would never come before the object being # linked, so don't bother handling this case. 
esac - elif test "x$p" != x ; then # skip if $p is empty + else if test -z "$_LT_TAGVAR(postdeps, $1)"; then _LT_TAGVAR(postdeps, $1)=$prev$p else @@ -7922,14 +7823,6 @@ _LT_TAGDECL([], [postdeps], [1]) _LT_TAGDECL([], [compiler_lib_search_path], [1], [The library search path used internally by the compiler when linking a shared library]) - -ac_nostdlib_flag= -# Fujitsu compilers -if test "$cc_basename" == FCCpx || test "$cc_basename" == fccpx || test "$cc_basename" == frtpx ; then - ac_nostdlib_flag=-Xg -fi -_LT_TAGVAR(nostdlib_flag, $1)=$ac_nostdlib_flag -_LT_TAGDECL([], [nostdlib_flag], [1]) ])# _LT_SYS_HIDDEN_LIBDEPS diff --git a/m4/list_len.m4 b/m4/list_len.m4 new file mode 100644 index 0000000000..7e6590eed7 --- /dev/null +++ b/m4/list_len.m4 @@ -0,0 +1,6 @@ +divert(`-1') +# list_len((item_1, item_2, ..., item_n)) +# parenthesized list, simple version +define(`list_len', `_list_len($@, 0)')`'dnl +define(`_list_len',`ifelse(`$1', `()', `$2', `$0((shift$1), incr(`$2'))')')`'dnl +divert`'dnl \ No newline at end of file diff --git a/m4/utils.m4 b/m4/utils.m4 index 3ef65eb059..25024922f9 100644 --- a/m4/utils.m4 +++ b/m4/utils.m4 @@ -109,6 +109,11 @@ dnl define(`CDF2_ITYPE_LIST', `text, schar, short, int, long, float, double')dnl dnl dnl +dnl dnl dnl +dnl +define(`CDF5_EXTRA_ITYPE_LIST', `uchar, ushort, uint, longlong, ulonglong')dnl +dnl +dnl define(`CollIndep', `ifelse(`$1', `_all', `NC_REQ_COLL', `NC_REQ_INDEP')')dnl define(`ReadWrite', `ifelse(`$1', `get', `NC_REQ_WR', `$1', `iget', `NC_REQ_RD', @@ -204,25 +209,25 @@ define(`IFMT',`ifelse( `$1', `longlong', `%lld', `$1', `ulonglong', `%llu')')dnl dnl -define(`PUT_VAR',`ifdef(`PNETCDF',`ncmpi_put_var_$1_all',`nc_put_var_$1')')dnl +define(`PUT_VAR',`ifdef(`PNETCDF',`ncmpi_put_var_$1$2',`nc_put_var_$1')')dnl dnl -define(`GET_VAR',`ifdef(`PNETCDF',`ncmpi_get_var_$1_all',`nc_get_var_$1')')dnl +define(`GET_VAR',`ifdef(`PNETCDF',`ncmpi_get_var_$1$2',`nc_get_var_$1')')dnl dnl 
-define(`PUT_VAR1',`ifdef(`PNETCDF',`ncmpi_put_var1_$1_all',`nc_put_var1_$1')')dnl +define(`PUT_VAR1',`ifdef(`PNETCDF',`ncmpi_put_var1_$1$2',`nc_put_var1_$1')')dnl dnl -define(`GET_VAR1',`ifdef(`PNETCDF',`ncmpi_get_var1_$1_all',`nc_get_var1_$1')')dnl +define(`GET_VAR1',`ifdef(`PNETCDF',`ncmpi_get_var1_$1$2',`nc_get_var1_$1')')dnl dnl -define(`PUT_VARA',`ifdef(`PNETCDF',`ncmpi_put_vara_$1_all',`nc_put_vara_$1')')dnl +define(`PUT_VARA',`ifdef(`PNETCDF',`ncmpi_put_vara_$1$2',`nc_put_vara_$1')')dnl dnl -define(`GET_VARA',`ifdef(`PNETCDF',`ncmpi_get_vara_$1_all',`nc_get_vara_$1')')dnl +define(`GET_VARA',`ifdef(`PNETCDF',`ncmpi_get_vara_$1$2',`nc_get_vara_$1')')dnl dnl -define(`PUT_VARS',`ifdef(`PNETCDF',`ncmpi_put_vars_$1_all',`nc_put_vars_$1')')dnl +define(`PUT_VARS',`ifdef(`PNETCDF',`ncmpi_put_vars_$1$2',`nc_put_vars_$1')')dnl dnl -define(`GET_VARS',`ifdef(`PNETCDF',`ncmpi_get_vars_$1_all',`nc_get_vars_$1')')dnl +define(`GET_VARS',`ifdef(`PNETCDF',`ncmpi_get_vars_$1$2',`nc_get_vars_$1')')dnl dnl -define(`PUT_VARM',`ifdef(`PNETCDF',`ncmpi_put_varm_$1_all',`nc_put_varm_$1')')dnl +define(`PUT_VARM',`ifdef(`PNETCDF',`ncmpi_put_varm_$1$2',`nc_put_varm_$1')')dnl dnl -define(`GET_VARM',`ifdef(`PNETCDF',`ncmpi_get_varm_$1_all',`nc_get_varm_$1')')dnl +define(`GET_VARM',`ifdef(`PNETCDF',`ncmpi_get_varm_$1$2',`nc_get_varm_$1')')dnl dnl define(`XTYPE_MAX',`ifelse( `$1', `text', `127', diff --git a/scripts/ltmain.sh b/scripts/ltmain.sh index acd0c1343f..3e6a3db3a5 100644 --- a/scripts/ltmain.sh +++ b/scripts/ltmain.sh @@ -8325,13 +8325,6 @@ func_mode_link () # Convert "-framework foo" to "foo.ltframework" if test -n "$inherited_linker_flags"; then tmp_inherited_linker_flags=`$ECHO "$inherited_linker_flags" | $SED 's/-framework \([^ $]*\)/\1.ltframework/g'` - - # Additionally convert " -pthread" to " -Wl,-pthread" for nagfor - func_cc_basename $CC - case $func_cc_basename_result in - nagfor*) tmp_inherited_linker_flags=`$ECHO "$tmp_inherited_linker_flags" | $SED 's/ -pthread/ 
-Wl,-pthread/g'` ;; - esac - for tmp_inherited_linker_flag in $tmp_inherited_linker_flags; do case " $new_inherited_linker_flags " in *" $tmp_inherited_linker_flag "*) ;; @@ -9367,8 +9360,7 @@ func_mode_link () xlcverstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision" verstring="-compatibility_version $minor_current -current_version $minor_current.$revision" # On Darwin other compilers - func_cc_basename $CC - case $func_cc_basename_result in + case $CC in nagfor*) verstring="$wl-compatibility_version $wl$minor_current $wl-current_version $wl$minor_current.$revision" ;; @@ -9875,13 +9867,6 @@ func_mode_link () ;; esac - # Time to revert the changes made for nagfor. - func_cc_basename $CC - case $func_cc_basename_result in - nagfor*) - new_inherited_linker_flags=`$ECHO " $new_inherited_linker_flags" | $SED 's% -Wl,-pthread% -pthread%g'` ;; - esac - # move library search paths that coincide with paths to not yet # installed libraries to the beginning of the library search list new_libs= diff --git a/sneak_peek.md b/sneak_peek.md index 3635450639..cdf1e60bfa 100644 --- a/sneak_peek.md +++ b/sneak_peek.md @@ -26,7 +26,14 @@ This is essentially a placeholder for the next release note ... + none * API semantics updates - + none + + API ncmpi_inq_header_size() now can be called in the define mode. This API + returns the file header size with metadata defined by the time of the call. + This information can be helpful to pick proper values for arguments + h_minfree, v_align, v_minfree, r_align when calling ncmpi__enddef() to + allocate a sufficiently large free space for file header extent and + variable data sections to grow without moving data already stored in the + file, i.e. when adding new variables, dimensions, or attributes. + See [PR #201](https://github.com/Parallel-NetCDF/PnetCDF/pull/201). * New error code precedence + none @@ -38,7 +45,20 @@ This is essentially a placeholder for the next release note ... 
+ none * New PnetCDF hints - + none + + 'nc_data_move_chunk_size' -- When adding new data objects into an existing + file, the data section may need to be moved to a higher file offset. The + movement is performed in chunks. This hint allows users to customize the + chunk size. The default is 1048576 bytes, i.e. 1 MiB. + See [PR #203](https://github.com/Parallel-NetCDF/PnetCDF/pull/203). + + 'nc_striping' -- When creating a new file on the Lustre file system, this + hint advises PnetCDF to set the file's striping configuration. The hint + value is either "auto" or "inherit". The former sets the striping count to + the number of compute nodes found in the MPI communicator passed to + `ncmpi_create()`. The latter makes the new file inherit the folder's + striping settings if the folder's striping is set. This hint's default is + "auto". Hint 'nc_striping' is ignored when MPI-IO hints `striping_factor` + and `striping_unit` are set. + See [PR #222](https://github.com/Parallel-NetCDF/PnetCDF/pull/222). * New run-time environment variables + none @@ -53,7 +73,9 @@ This is essentially a placeholder for the next release note ... + none * Bug fixes - + none + + Fix data movement when new record variables are added to an existing file + that does not change the starting offset of the record variable section. + See [PR #199](https://github.com/Parallel-NetCDF/PnetCDF/pull/199). * New example programs + none @@ -62,7 +84,10 @@ This is essentially a placeholder for the next release note ... + none * New test program - + none + + test/testcases/tst_grow_data.c -- adding new variables by re-entering the + define mode multiple times, but does not cause file header extent to grow. + It also tests a case when adding a new record variable that does not change + the starting offset of the record variable section in the file.
* Issues with NetCDF library + none diff --git a/src/binding/cxx/ncmpiAtt.cpp b/src/binding/cxx/ncmpiAtt.cpp index 55fd35a1cb..7432d921cd 100644 --- a/src/binding/cxx/ncmpiAtt.cpp +++ b/src/binding/cxx/ncmpiAtt.cpp @@ -29,8 +29,8 @@ NcmpiAtt::NcmpiAtt() : {} // Constructor for non-null instances. -NcmpiAtt::NcmpiAtt(bool nullObject): - nullObject(nullObject), +NcmpiAtt::NcmpiAtt(bool nullObj): + nullObject(nullObj), groupId(-1), varId(-1) {} diff --git a/src/binding/cxx/ncmpiEnumType.cpp b/src/binding/cxx/ncmpiEnumType.cpp index 1edaacddfb..04935d8014 100644 --- a/src/binding/cxx/ncmpiEnumType.cpp +++ b/src/binding/cxx/ncmpiEnumType.cpp @@ -58,8 +58,8 @@ NcmpiEnumType::NcmpiEnumType(const NcmpiGroup& grp, const string& name): // constructor -NcmpiEnumType::NcmpiEnumType(const NcmpiType& ncmpiType): - NcmpiType(ncmpiType) +NcmpiEnumType::NcmpiEnumType(const NcmpiType& xType): + NcmpiType(xType) { // check the nctype object is the base of an Enum type if(getTypeClass() != NC_ENUM) throw NcmpiException("The NcmpiType object must be the base of an Enum type.",__FILE__,__LINE__); diff --git a/src/binding/cxx/ncmpiOpaqueType.cpp b/src/binding/cxx/ncmpiOpaqueType.cpp index a270dc2c5c..c2256fd6b4 100644 --- a/src/binding/cxx/ncmpiOpaqueType.cpp +++ b/src/binding/cxx/ncmpiOpaqueType.cpp @@ -50,8 +50,8 @@ NcmpiOpaqueType::NcmpiOpaqueType(const NcmpiGroup& grp, const string& name) : // constructor -NcmpiOpaqueType::NcmpiOpaqueType(const NcmpiType& ncmpiType) : - NcmpiType(ncmpiType) +NcmpiOpaqueType::NcmpiOpaqueType(const NcmpiType& xType) : + NcmpiType(xType) { // check the nctype object is the base of a Opaque type if(getTypeClass() != NC_OPAQUE) throw NcmpiException("The NcmpiType object must be the base of an Opaque type.",__FILE__,__LINE__); diff --git a/src/binding/cxx/ncmpiVlenType.cpp b/src/binding/cxx/ncmpiVlenType.cpp index aec268b5f5..59ae86d0b2 100644 --- a/src/binding/cxx/ncmpiVlenType.cpp +++ b/src/binding/cxx/ncmpiVlenType.cpp @@ -58,8 +58,8 @@ 
NcmpiVlenType::NcmpiVlenType(const NcmpiGroup& grp, const string& name) : {} // constructor -NcmpiVlenType::NcmpiVlenType(const NcmpiType& ncmpiType): - NcmpiType(ncmpiType) +NcmpiVlenType::NcmpiVlenType(const NcmpiType& xType): + NcmpiType(xType) { // check the nctype object is the base of a Vlen type if(getTypeClass() != NC_VLEN) throw NcmpiException("The NcmpiType object must be the base of a Vlen type.",__FILE__,__LINE__); diff --git a/src/binding/f77/pnetcdf.inc.in b/src/binding/f77/pnetcdf.inc.in index 9a5966a4d1..725650f396 100644 --- a/src/binding/f77/pnetcdf.inc.in +++ b/src/binding/f77/pnetcdf.inc.in @@ -14,10 +14,12 @@ integer PNETCDF_VERSION_MAJOR integer PNETCDF_VERSION_MINOR integer PNETCDF_VERSION_SUB + character*16 PNETCDF_VERSION_PRE parameter (PNETCDF_VERSION_MAJOR = @PNETCDF_VERSION_MAJOR@) parameter (PNETCDF_VERSION_MINOR = @PNETCDF_VERSION_MINOR@) parameter (PNETCDF_VERSION_SUB = @PNETCDF_VERSION_SUB@) + parameter (PNETCDF_VERSION_PRE = "@PNETCDF_VERSION_PRE@") ! ! list of PnetCDF options enabled/disabled at configure time diff --git a/src/binding/f90/nfmpi_constants.fh.in b/src/binding/f90/nfmpi_constants.fh.in index ea6d31d371..1d9f6acf6c 100644 --- a/src/binding/f90/nfmpi_constants.fh.in +++ b/src/binding/f90/nfmpi_constants.fh.in @@ -16,6 +16,8 @@ PNETCDF_VERSION_MINOR = @PNETCDF_VERSION_MINOR@, & PNETCDF_VERSION_SUB = @PNETCDF_VERSION_SUB@ + character(len=16), parameter :: PNETCDF_VERSION_PRE = "@PNETCDF_VERSION_PRE@" + ! ! list of PnetCDF options enabled/disabled at configure time ! 
diff --git a/src/dispatchers/attr_getput.m4 b/src/dispatchers/attr_getput.m4 index dcdca8355a..ac330887ff 100644 --- a/src/dispatchers/attr_getput.m4 +++ b/src/dispatchers/attr_getput.m4 @@ -198,12 +198,10 @@ check_consistency_put(MPI_Comm comm, /* check if buf contents is consistent across all processes */ if (root_nelems > 0) { /* non-scalar attribute */ /* note xsz is aligned, thus must use the exact size of buf */ - int rank, itype_size; + int itype_size; size_t buf_size; void *root_buf; - MPI_Comm_rank(comm, &rank); - /* for attributes, itype is nc_type, so its size is small. Thus, no * need to check against NC_MAX_INT. */ diff --git a/src/dispatchers/cdl_header_parser.c b/src/dispatchers/cdl_header_parser.c index 421999e85b..d5efc2e781 100644 --- a/src/dispatchers/cdl_header_parser.c +++ b/src/dispatchers/cdl_header_parser.c @@ -720,7 +720,7 @@ int cdl_hdr_open(const char *filename, return NC_EFILE; } rlen = fread(fbuf, 1, file_size, fptr); - if (rlen < 0) { + if (file_size > 0 && rlen == 0) { printf("Error in %s at %d: fail to fread file %s (%s)\n", __func__,__LINE__,filename,strerror(errno)); return NC_EFILE; diff --git a/src/dispatchers/error_codes.c b/src/dispatchers/error_codes.c index 1a71538f0e..b78c447d4d 100644 --- a/src/dispatchers/error_codes.c +++ b/src/dispatchers/error_codes.c @@ -289,12 +289,16 @@ ncmpi_strerror(int err) return "Variable fill value is inconsistent among processes."; case NC_EMULTIDEFINE_CMODE: return "File create mode is inconsistent among processes."; + case NC_EMULTIDEFINE_HINTS: + return "I/O hints are not consistent among processes."; case NC_EBADLOG: return "Unrecognized burst buffering log file format."; case NC_EFLUSHED: return "Nonblocking requests already flushed."; case NC_EADIOS: return "unknown ADIOS error."; + case NC_EFSTYPE: + return "Invalid file system type."; default: /* check netCDF-3 and netCDF-4 errors */ @@ -719,6 +723,7 @@ ncmpi_strerrno(int err) case (NC_EBADLOG): return "NC_EBADLOG"; case (NC_EFLUSHED): 
return "NC_EFLUSHED"; case (NC_EADIOS): return "NC_EADIOS"; + case (NC_EFSTYPE): return "NC_EFSTYPE"; case (NC_EMULTIDEFINE): return "NC_EMULTIDEFINE"; case (NC_EMULTIDEFINE_OMODE): return "NC_EMULTIDEFINE_OMODE"; @@ -744,6 +749,8 @@ ncmpi_strerrno(int err) case (NC_EMULTIDEFINE_VAR_FILL_MODE): return "NC_EMULTIDEFINE_VAR_FILL_MODE"; case (NC_EMULTIDEFINE_VAR_FILL_VALUE): return "NC_EMULTIDEFINE_VAR_FILL_VALUE"; case (NC_EMULTIDEFINE_CMODE): return "NC_EMULTIDEFINE_CMODE"; + case (NC_EMULTIDEFINE_HINTS): return "NC_EMULTIDEFINE_HINTS"; + default: sprintf(unknown_str,"Unknown code %d",err); } diff --git a/src/dispatchers/file.c b/src/dispatchers/file.c index b68782038b..68b3239b47 100644 --- a/src/dispatchers/file.c +++ b/src/dispatchers/file.c @@ -39,7 +39,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; #define BUFREAD64(buf,var) memcpy(&var, buf, 8); if (diff_endian) swap_64(&var); #endif -/* Note accessing the following 3 global variables must be protected by a +/* Note accessing the following 5 global variables must be protected by a * mutex, otherwise it will not be thread safe. */ @@ -52,6 +52,12 @@ static int pnc_numfiles; */ static int ncmpi_default_create_format = NC_FORMAT_CLASSIC; +/* attribute to be cached in all communicators */ +static int pncio_node_ids_keyval = MPI_KEYVAL_INVALID; + +/* attribute to be cached in MPI_COMM_SELF */ +static int pncio_init_keyval = MPI_KEYVAL_INVALID; + #define NCMPII_HANDLE_ERROR(func) \ if (mpireturn != MPI_SUCCESS) { \ int errorStringLen; \ @@ -69,6 +75,146 @@ static int ncmpi_default_create_format = NC_FORMAT_CLASSIC; } \ } +/* struct PNCIO_node_ids is defined in dispatch.h */ + +/*----< PNCIO_node_ids_copy() >----------------------------------------------*/ +/* A function to be invoked when a communicator is duplicated, which adds a + * reference to the already allocated memory space storing node ID array. 
+ */ +static +int PNCIO_node_ids_copy(MPI_Comm comm, + int keyval, + void *extra, + void *attr_inP, + void *attr_outP, + int *flag) +{ + PNCIO_node_ids *attr_in = (PNCIO_node_ids*) attr_inP; + PNCIO_node_ids **attr_out = (PNCIO_node_ids**)attr_outP; + + if (attr_in == NULL) + return MPI_ERR_KEYVAL; + else + attr_in->ref_count++; + + *attr_out = attr_in; + + *flag = 1; /* make a copy in the new communicator */ + + return MPI_SUCCESS; +} + +/*----< PNCIO_node_ids_delete() >--------------------------------------------*/ +/* Callback function to be called when a communicator is freed, which frees the + * allocated memory space of node ID array. + */ +static +int PNCIO_node_ids_delete(MPI_Comm comm, + int keyval, + void *attr_val, + void *extra) +{ + PNCIO_node_ids *node_ids = (PNCIO_node_ids*) attr_val; + + if (node_ids == NULL) + return MPI_ERR_KEYVAL; + else + node_ids->ref_count--; + + if (node_ids->ref_count <= 0) { + /* free the allocated array */ + if (node_ids->ids != NULL) + free(node_ids->ids); + free(node_ids); + } + return MPI_SUCCESS; +} + +/*----< PNCIO_end_call() >---------------------------------------------------*/ +/* Callback function to be called at MPI_Finalize(), which frees all cached + * attributes. + */ +static +int PNCIO_end_call(MPI_Comm comm, + int keyval, + void *attribute_val, + void *extra_state) +{ + /* Free all keyvals used by PnetCDF */ + + MPI_Comm_free_keyval(&keyval); /* free pncio_init_keyval */ + + if (pncio_node_ids_keyval != MPI_KEYVAL_INVALID) + MPI_Comm_free_keyval(&pncio_node_ids_keyval); + + return MPI_SUCCESS; +} + +/*----< set_get_comm_attr() >------------------------------------------------*/ +/* Create/set/get attributes into/from the MPI communicators passed in from the + * user application. + */ +static +void set_get_comm_attr(MPI_Comm comm, + PNCIO_node_ids *node_idsP) +{ + PNCIO_node_ids *node_ids; + + if (pncio_init_keyval == MPI_KEYVAL_INVALID) { + /* This is the first call ever to PnetCDF API. 
Creating key + * pncio_init_keyval is necessary for MPI_Finalize() to free key + * pncio_node_ids_keyval. + */ + MPI_Comm_create_keyval(MPI_NULL_COPY_FN, PNCIO_end_call, + &pncio_init_keyval, (void*)0); + MPI_Comm_set_attr(MPI_COMM_SELF, pncio_init_keyval, (void*)0); + } + + if (pncio_node_ids_keyval == MPI_KEYVAL_INVALID) { + MPI_Comm_create_keyval(PNCIO_node_ids_copy, PNCIO_node_ids_delete, + &pncio_node_ids_keyval, NULL); + /* ignore error, as it is not a critical error */ + } + + if (pncio_node_ids_keyval != MPI_KEYVAL_INVALID) { + int found, nprocs; + + MPI_Comm_get_attr(comm, pncio_node_ids_keyval, &node_ids, &found); + if (!found) { + /* Construct an array storing node IDs of all processes. Note the + * memory allocated for node_ids will be freed by + * PNCIO_node_ids_delete(), a callback function invoked when the + * MPI communicator is freed. + */ + node_ids = (PNCIO_node_ids*) malloc(sizeof(PNCIO_node_ids)); + node_ids->ref_count = 1; + + MPI_Comm_size(comm, &nprocs); + if (nprocs == 1) { + node_ids->num_nodes = 1; + node_ids->ids = (int*) malloc(sizeof(int)); + node_ids->ids[0] = 0; + } + else { + /* Constructing node IDs requires communication calls to + * MPI_Get_processor_name(), MPI_Gather(), and MPI_Bcast(). + */ + ncmpii_construct_node_list(comm, &node_ids->num_nodes, + &node_ids->ids); + } + + /* FYI. The same key pncio_node_ids_keyval can be added to + * different MPI communicators with same or different values. + */ + MPI_Comm_set_attr(comm, pncio_node_ids_keyval, node_ids); + } + /* else case: returned node_ids contains the cached value */ + + /* copy contents */ + *node_idsP = *node_ids; + } +} + /*----< new_id_PNCList() >---------------------------------------------------*/ /* Return a new ID (array index) from the PNC list, pnc_filelist[] that is * not used. Note the used elements in pnc_filelist[] may not be contiguous. 
@@ -166,6 +312,7 @@ combine_env_hints(MPI_Info user_info, /* IN */ { char *warn_str="Warning: skip ill-formed hint set in PNETCDF_HINTS"; char *env_str; + char *hdr_align_val=NULL, *var_align_val=NULL; /* take hints from the environment variable PNETCDF_HINTS, a string of * hints separated by ";" and each hint is in the form of hint=value. E.g. @@ -182,17 +329,17 @@ combine_env_hints(MPI_Info user_info, /* IN */ if ((env_str = getenv("PNETCDF_HINTS")) != NULL) { #ifdef USE_STRTOK_R char *env_str_cpy, *env_str_saved, *hint, *key; - env_str_cpy = strdup(env_str); + env_str_cpy = NCI_Strdup(env_str); env_str_saved = env_str_cpy; hint = strtok_r(env_str_cpy, ";", &env_str_saved); while (hint != NULL) { - char *hint_saved = strdup(hint); + char *hint_saved = NCI_Strdup(hint); char *val = strchr(hint, '='); if (val == NULL) { /* ill-formed hint */ if (NULL != strtok(hint, " \t")) printf("%s: '%s'\n", warn_str, hint_saved); /* else case: ignore white-spaced hints */ - free(hint_saved); + NCI_Free(hint_saved); hint = strtok_r(NULL, ";", &env_str_saved); /* get next hint */ continue; } @@ -203,18 +350,24 @@ combine_env_hints(MPI_Info user_info, /* IN */ else { if (*new_info == MPI_INFO_NULL) MPI_Info_create(new_info); /* ignore error */ - MPI_Info_set(*new_info, key, val); /* override or add */ + + if (!strcmp(key, "nc_header_align_size")) + hdr_align_val = NCI_Strdup(val); + else if (!strcmp(key, "nc_var_align_size")) + var_align_val = NCI_Strdup(val); + else + MPI_Info_set(*new_info, key, val); /* override or add */ } /* printf("env hint: key=%s val=%s\n",key,val); */ hint = strtok_r(NULL, ";", &env_str_saved); - free(hint_saved); + NCI_Free(hint_saved); } - free(env_str_cpy); + NCI_Free(env_str_cpy); #else char *env_str_cpy, *hint, *next_hint, *key, *val, *deli; char *hint_saved=NULL; - env_str_cpy = strdup(env_str); + env_str_cpy = NCI_Strdup(env_str); next_hint = env_str_cpy; do { @@ -225,14 +378,14 @@ combine_env_hints(MPI_Info user_info, /* IN */ next_hint = deli + 
1; } else next_hint = "\0"; - if (hint_saved != NULL) free(hint_saved); + if (hint_saved != NULL) NCI_Free(hint_saved); /* skip all-blank hint */ - hint_saved = strdup(hint); + hint_saved = NCI_Strdup(hint); if (strtok(hint, " \t") == NULL) continue; - free(hint_saved); - hint_saved = strdup(hint); /* save hint for error message */ + NCI_Free(hint_saved); + hint_saved = NCI_Strdup(hint); /* save hint for error message */ deli = strchr(hint, '='); if (deli == NULL) { /* ill-formed hint */ @@ -257,15 +410,73 @@ combine_env_hints(MPI_Info user_info, /* IN */ } if (*new_info == MPI_INFO_NULL) MPI_Info_create(new_info); /* ignore error */ - MPI_Info_set(*new_info, key, val); /* override or add */ + + if (!strcmp(key, "nc_header_align_size")) + hdr_align_val = NCI_Strdup(val); + else if (!strcmp(key, "nc_var_align_size")) + var_align_val = NCI_Strdup(val); + else + MPI_Info_set(*new_info, key, val); /* override or add */ } while (*next_hint != '\0'); - if (hint_saved != NULL) free(hint_saved); - free(env_str_cpy); + if (hint_saved != NULL) NCI_Free(hint_saved); + NCI_Free(env_str_cpy); #endif + + /* nc_var_align_size supersedes nc_header_align_size */ + if (var_align_val != NULL) { + MPI_Info_set(*new_info, "nc_var_align_size", var_align_val); + MPI_Info_set(*new_info, "nc_header_align_size", var_align_val); + } + else if (hdr_align_val != NULL) { + MPI_Info_set(*new_info, "nc_var_align_size", hdr_align_val); + MPI_Info_set(*new_info, "nc_header_align_size", hdr_align_val); + } } /* return no error as all hints are advisory */ + + if (hdr_align_val != NULL) NCI_Free(hdr_align_val); + if (var_align_val != NULL) NCI_Free(var_align_val); + +} + +/*----< set_env_mode() >-----------------------------------------------------*/ +static +void set_env_mode(int *env_mode) +{ + char *env_str; + +#ifdef PNETCDF_DEBUG + fSet(*env_mode, NC_MODE_SAFE); + /* When debug mode is enabled at the configure time, safe mode is by + * default enabled. 
This can be overwritten by the run-time environment + * variable PNETCDF_SAFE_MODE. + */ +#endif + /* get environment variable PNETCDF_SAFE_MODE + * if it is set to 1, then we perform a strict parameter consistent test + */ + if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) { + if (*env_str == '0') fClr(*env_mode, NC_MODE_SAFE); + else fSet(*env_mode, NC_MODE_SAFE); + /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can + * be '\0' (null character). In this case, safe mode is enabled */ + } + + /* get environment variable PNETCDF_RELAX_COORD_BOUND + * if it is set to 0, then we perform a strict start bound check + */ +#ifdef RELAX_COORD_BOUND + fSet(*env_mode, NC_MODE_STRICT_COORD_BOUND); +#endif + if ((env_str = getenv("PNETCDF_RELAX_COORD_BOUND")) != NULL) { + if (*env_str == '0') fClr(*env_mode, NC_MODE_STRICT_COORD_BOUND); + else fSet(*env_mode, NC_MODE_STRICT_COORD_BOUND); + /* if PNETCDF_RELAX_COORD_BOUND is set but without a value, *env_str + * can be '\0' (null character). This is equivalent to setting + * PNETCDF_RELAX_COORD_BOUND to 1 */ + } } /*----< ncmpi_create() >-----------------------------------------------------*/ @@ -278,53 +489,45 @@ ncmpi_create(MPI_Comm comm, int *ncidp) { int rank, nprocs, status=NC_NOERR, err; - int safe_mode=0, mpireturn, relax_coord_bound, format; - char *env_str; + int env_mode=0, mpireturn, format; MPI_Info combined_info; void *ncp; PNC *pncp; PNC_driver *driver; + PNCIO_node_ids node_ids; #ifdef BUILD_DRIVER_FOO int enable_foo_driver=0; #endif #ifdef ENABLE_BURST_BUFFER int enable_bb_driver=0; #endif +#ifdef ENABLE_CHUNKING + int enable_chk_driver=0; +#endif MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); -#ifdef PNETCDF_DEBUG - safe_mode = 1; - /* When debug mode is enabled at the configure time, safe_mode is by - * default enabled. 
This can be overwritten by the run-time environment - * variable PNETCDF_SAFE_MODE */ +#ifdef ENABLE_THREAD_SAFE + int perr; + perr = pthread_mutex_lock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_lock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); #endif - /* get environment variable PNETCDF_SAFE_MODE - * if it is set to 1, then we perform a strict parameter consistent test - */ - if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) { - if (*env_str == '0') safe_mode = 0; - else safe_mode = 1; - /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can - * be '\0' (null character). In this case, safe_mode is enabled */ - } - /* get environment variable PNETCDF_RELAX_COORD_BOUND - * if it is set to 0, then we perform a strict start bound check - */ -#ifndef RELAX_COORD_BOUND - relax_coord_bound = 0; -#else - relax_coord_bound = 1; + /* creating communicator attributes must be protected by a mutex */ + set_get_comm_attr(comm, &node_ids); + +#ifdef ENABLE_THREAD_SAFE + perr = pthread_mutex_unlock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_unlock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); #endif - if ((env_str = getenv("PNETCDF_RELAX_COORD_BOUND")) != NULL) { - if (*env_str == '0') relax_coord_bound = 0; - else relax_coord_bound = 1; - /* if PNETCDF_RELAX_COORD_BOUND is set but without a value, *env_str - * can be '\0' (null character). 
This is equivalent to setting - * relax_coord_bound to 1 */ - } + + if (rank == 0) + set_env_mode(&env_mode); /* path's validity is checked in MPI-IO with error code MPI_ERR_BAD_FILE * path consistency is checked in MPI-IO with error code MPI_ERR_NOT_SAME @@ -332,17 +535,19 @@ ncmpi_create(MPI_Comm comm, if (path == NULL || *path == '\0') DEBUG_RETURN_ERROR(NC_EBAD_FILE) if (nprocs > 1) { /* Check cmode consistency */ - int root_cmode = cmode; /* only root's matters */ - TRACE_COMM(MPI_Bcast)(&root_cmode, 1, MPI_INT, 0, comm); + int modes[2] = {cmode, env_mode}; /* only root's matters */ + + TRACE_COMM(MPI_Bcast)(&modes, 2, MPI_INT, 0, comm); NCMPII_HANDLE_ERROR("MPI_Bcast") /* Overwrite cmode with root's cmode */ - if (root_cmode != cmode) { - cmode = root_cmode; + if (modes[0] != cmode) { + cmode = modes[0]; DEBUG_ASSIGN_ERROR(status, NC_EMULTIDEFINE_CMODE) } - if (safe_mode) { /* sync status among all processes */ + env_mode = modes[1]; + if (fIsSet(env_mode, NC_MODE_SAFE)) { /* sync status among all processes */ err = status; TRACE_COMM(MPI_Allreduce)(&err, &status, 1, MPI_INT, MPI_MIN, comm); NCMPII_HANDLE_ERROR("MPI_Allreduce") @@ -382,6 +587,18 @@ ncmpi_create(MPI_Comm comm, enable_bb_driver = 1; } #endif +#ifdef ENABLE_CHUNKING + if (combined_info != MPI_INFO_NULL) { + char value[MPI_MAX_INFO_VAL]; + int flag; + + /* check if nc_chunking is enabled */ + MPI_Info_get(combined_info, "nc_chunking", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag && strcasecmp(value, "enable") == 0) + enable_chk_driver = 1; + } +#endif /* Use environment variable and cmode to tell the file format * which is later used to select the right driver. 
@@ -462,6 +679,11 @@ ncmpi_create(MPI_Comm comm, if (enable_bb_driver) driver = ncbbio_inq_driver(); else +#endif +#ifdef ENABLE_CHUNKING + if (enable_chk_driver) + driver = ncchkio_inq_driver(); + else #endif /* default is the driver built on top of MPI-IO */ driver = ncmpio_inq_driver(); @@ -475,6 +697,9 @@ ncmpi_create(MPI_Comm comm, DEBUG_RETURN_ERROR(NC_ENOMEM) } + pncp->flag = NC_MODE_DEF | NC_MODE_CREATE; + fSet(pncp->flag, env_mode); + /* generate a new nc file ID from NCPList */ err = new_id_PNCList(ncidp, pncp); if (err != NC_NOERR) { @@ -483,9 +708,12 @@ ncmpi_create(MPI_Comm comm, return err; } - /* Duplicate comm, because users may free it (though unlikely). Note - * MPI_Comm_dup() is collective. We pass pncp->comm to drivers, so there - * is no need for a driver to duplicate it again. + /* Duplicate comm, because users may use it doing other point-to-point + * communication. When this happened, that communication can mess up with + * the PnetCDF/MPI-IO internal communication, particularly when in + * independent data mode. Note MPI_Comm_dup() is collective. We pass + * pncp->comm to drivers, so there is no need for a driver to duplicate it + * again. 
*/ if (comm != MPI_COMM_WORLD && comm != MPI_COMM_SELF) { mpireturn = MPI_Comm_dup(comm, &pncp->comm); @@ -495,31 +723,26 @@ ncmpi_create(MPI_Comm comm, else pncp->comm = comm; + /* fill in pncp members */ + pncp->path = (char*) NCI_Strdup(path); + if (pncp->path == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + /* calling the driver's create subroutine */ - err = driver->create(pncp->comm, path, cmode, *ncidp, combined_info, &ncp); + err = driver->create(pncp->comm, pncp->path, cmode, *ncidp, env_mode, + combined_info, node_ids, &ncp); if (status == NC_NOERR) status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_CMODE) { del_from_PNCList(*ncidp); if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) MPI_Comm_free(&pncp->comm); /* a collective call */ + NCI_Free(pncp->path); NCI_Free(pncp); *ncidp = -1; return status; } - /* fill in pncp members */ - pncp->path = (char*) NCI_Malloc(strlen(path)+1); - if (pncp->path == NULL) { - driver->close(ncp); /* close file and ignore error */ - del_from_PNCList(*ncidp); - if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) - MPI_Comm_free(&pncp->comm); /* a collective call */ - NCI_Free(pncp); - *ncidp = -1; - DEBUG_RETURN_ERROR(NC_ENOMEM) - } - strcpy(pncp->path, path); pncp->mode = cmode; pncp->driver = driver; pncp->ndims = 0; @@ -527,12 +750,14 @@ ncmpi_create(MPI_Comm comm, pncp->nvars = 0; pncp->nrec_vars = 0; pncp->vars = NULL; - pncp->flag = NC_MODE_DEF | NC_MODE_CREATE; pncp->ncp = ncp; pncp->format = format; - if (safe_mode) pncp->flag |= NC_MODE_SAFE; - if (!relax_coord_bound) pncp->flag |= NC_MODE_STRICT_COORD_BOUND; + if (fIsSet(env_mode, NC_MODE_SAFE)) + pncp->flag |= NC_MODE_SAFE; + + if (fIsSet(env_mode, NC_MODE_STRICT_COORD_BOUND)) + pncp->flag |= NC_MODE_STRICT_COORD_BOUND; return status; } @@ -549,53 +774,45 @@ ncmpi_open(MPI_Comm comm, int *ncidp) /* OUT */ { int i, j, nalloc, rank, nprocs, format, 
status=NC_NOERR, err; - int safe_mode=0, mpireturn, relax_coord_bound, DIMIDS[NDIMS_], *dimids; - char *env_str; + int env_mode=0, mpireturn, DIMIDS[NDIMS_], *dimids; MPI_Info combined_info; void *ncp; PNC *pncp; PNC_driver *driver; + PNCIO_node_ids node_ids; #ifdef BUILD_DRIVER_FOO int enable_foo_driver=0; #endif #ifdef ENABLE_BURST_BUFFER int enable_bb_driver=0; #endif +#ifdef ENABLE_CHUNKING + int enable_chk_driver=0; +#endif MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); -#ifdef PNETCDF_DEBUG - safe_mode = 1; - /* When debug mode is enabled at the configure time, safe_mode is by - * default enabled. This can be overwritten by the run-time environment - * variable PNETCDF_SAFE_MODE */ +#ifdef ENABLE_THREAD_SAFE + int perr; + perr = pthread_mutex_lock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_lock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); #endif - /* get environment variable PNETCDF_SAFE_MODE - * if it is set to 1, then we perform a strict parameter consistent test - */ - if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) { - if (*env_str == '0') safe_mode = 0; - else safe_mode = 1; - /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can - * be '\0' (null character). 
In this case, safe_mode is enabled */ - } - /* get environment variable PNETCDF_RELAX_COORD_BOUND - * if it is set to 0, then we perform a strict start bound check - */ -#ifndef RELAX_COORD_BOUND - relax_coord_bound = 0; -#else - relax_coord_bound = 1; + /* creating communicator attributes must be protected by a mutex */ + set_get_comm_attr(comm, &node_ids); + +#ifdef ENABLE_THREAD_SAFE + perr = pthread_mutex_unlock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_unlock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); #endif - if ((env_str = getenv("PNETCDF_RELAX_COORD_BOUND")) != NULL) { - if (*env_str == '0') relax_coord_bound = 0; - else relax_coord_bound = 1; - /* if PNETCDF_RELAX_COORD_BOUND is set but without a value, *env_str - * can be '\0' (null character). This is equivalent to setting - * relax_coord_bound to 1 */ - } + + if (rank == 0) + set_env_mode(&env_mode); /* path's validity is checked in MPI-IO with error code MPI_ERR_BAD_FILE * path consistency is checked in MPI-IO with error code MPI_ERR_NOT_SAME @@ -625,32 +842,29 @@ ncmpi_open(MPI_Comm comm, } if (nprocs > 1) { /* root broadcasts format and omode */ - int root_omode, msg[2]; - - msg[0] = format; /* only root's matters (format or error code) */ + int modes[3] = {format, omode, env_mode}; - /* Check omode consistency: + /* Check consistency: + * Note only root's values matter, format, omode, env_mode. * Note if omode contains NC_NOWRITE, it is equivalent to NC_CLOBBER. * In pnetcdf.h, they both are defined the same value, 0. - * Only root's omode matters. 
*/ - msg[1] = omode; /* only root's matters */ - TRACE_COMM(MPI_Bcast)(&msg, 2, MPI_INT, 0, comm); + TRACE_COMM(MPI_Bcast)(&modes, 3, MPI_INT, 0, comm); NCMPII_HANDLE_ERROR("MPI_Bcast") /* check format error (a fatal error, must return now) */ - format = msg[0]; + format = modes[0]; if (format < 0) return format; /* all netCDF errors are negative */ /* check omode consistency */ - root_omode = msg[1]; - if (root_omode != omode) { - omode = root_omode; + if (modes[1] != omode) { + omode = modes[1]; DEBUG_ASSIGN_ERROR(status, NC_EMULTIDEFINE_OMODE) } - if (safe_mode) { /* sync status among all processes */ + env_mode = modes[2]; + if (fIsSet(env_mode, NC_MODE_SAFE)) { /* sync status among all processes */ err = status; TRACE_COMM(MPI_Allreduce)(&err, &status, 1, MPI_INT, MPI_MIN, comm); NCMPII_HANDLE_ERROR("MPI_Allreduce") @@ -691,6 +905,18 @@ ncmpi_open(MPI_Comm comm, enable_bb_driver = 1; } #endif +#ifdef ENABLE_CHUNKING + if (combined_info != MPI_INFO_NULL) { + char value[MPI_MAX_INFO_VAL]; + int flag; + + /* check if nc_chunking is enabled */ + MPI_Info_get(combined_info, "nc_chunking", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag && strcasecmp(value, "enable") == 0) + enable_chk_driver = 1; + } +#endif #ifdef ENABLE_NETCDF4 if (format == NC_FORMAT_NETCDF4_CLASSIC || format == NC_FORMAT_NETCDF4) { @@ -719,6 +945,11 @@ ncmpi_open(MPI_Comm comm, if (enable_bb_driver) driver = ncbbio_inq_driver(); else +#endif +#ifdef ENABLE_CHUNKING + if (enable_chk_driver) + driver = ncchkio_inq_driver(); + else #endif { /* ncmpio driver */ @@ -743,13 +974,19 @@ ncmpi_open(MPI_Comm comm, DEBUG_RETURN_ERROR(NC_ENOMEM) } + pncp->flag = 0; + fSet(pncp->flag, env_mode); + /* generate a new nc file ID from NCPList */ err = new_id_PNCList(ncidp, pncp); if (err != NC_NOERR) return err; - /* Duplicate comm, because users may free it (though unlikely). Note - * MPI_Comm_dup() is collective. We pass pncp->comm to drivers, so there - * is no need for a driver to duplicate it again. 
+ /* Duplicate comm, because users may use it doing other point-to-point + * communication. When this happened, that communication can mess up with + * the PnetCDF/MPI-IO internal communication, particularly when in + * independent data mode. Note MPI_Comm_dup() is collective. We pass + * pncp->comm to drivers, so there is no need for a driver to duplicate it + * again. */ if (comm != MPI_COMM_WORLD && comm != MPI_COMM_SELF) { mpireturn = MPI_Comm_dup(comm, &pncp->comm); @@ -759,8 +996,13 @@ ncmpi_open(MPI_Comm comm, else pncp->comm = comm; + pncp->path = (char*) NCI_Strdup(path); + if (pncp->path == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + /* calling the driver's open subroutine */ - err = driver->open(pncp->comm, path, omode, *ncidp, combined_info, &ncp); + err = driver->open(pncp->comm, pncp->path, omode, *ncidp, env_mode, + combined_info, node_ids, &ncp); if (status == NC_NOERR) status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_OMODE && @@ -770,23 +1012,13 @@ ncmpi_open(MPI_Comm comm, del_from_PNCList(*ncidp); if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) MPI_Comm_free(&pncp->comm); /* a collective call */ + NCI_Free(pncp->path); NCI_Free(pncp); *ncidp = -1; return status; } /* fill in pncp members */ - pncp->path = (char*) NCI_Malloc(strlen(path)+1); - if (pncp->path == NULL) { - driver->close(ncp); /* close file and ignore error */ - del_from_PNCList(*ncidp); - if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) - MPI_Comm_free(&pncp->comm); /* a collective call */ - NCI_Free(pncp); - *ncidp = -1; - DEBUG_RETURN_ERROR(NC_ENOMEM) - } - strcpy(pncp->path, path); pncp->mode = omode; pncp->driver = driver; pncp->ndims = 0; @@ -794,13 +1026,17 @@ ncmpi_open(MPI_Comm comm, pncp->nvars = 0; pncp->nrec_vars = 0; pncp->vars = NULL; - pncp->flag = 0; pncp->ncp = ncp; pncp->format = format; - if (!fIsSet(omode, NC_WRITE)) pncp->flag |= NC_MODE_RDONLY; - if 
(safe_mode) pncp->flag |= NC_MODE_SAFE; - if (!relax_coord_bound) pncp->flag |= NC_MODE_STRICT_COORD_BOUND; + if (!fIsSet(omode, NC_WRITE)) + pncp->flag |= NC_MODE_RDONLY; + + if (fIsSet(env_mode, NC_MODE_SAFE)) + pncp->flag |= NC_MODE_SAFE; + + if (fIsSet(env_mode, NC_MODE_STRICT_COORD_BOUND)) + pncp->flag |= NC_MODE_STRICT_COORD_BOUND; /* inquire number of dimensions, variables defined and rec dim ID */ err = driver->inq(pncp->ncp, &pncp->ndims, &pncp->nvars, NULL, @@ -1251,9 +1487,8 @@ ncmpi_inq_file_format(const char *filename, __func__,__LINE__,filename); DEBUG_RETURN_ERROR(NC_EFILE) } - if (close(fd) == -1) { + if (close(fd) == -1) DEBUG_RETURN_ERROR(NC_EFILE) - } if (memcmp(signature, cdf_signature, 3) == 0) { if (signature[3] == 5) *formatp = NC_FORMAT_CDF5; diff --git a/src/dispatchers/var_getput.m4 b/src/dispatchers/var_getput.m4 index f2757a9fed..638f2d6c97 100644 --- a/src/dispatchers/var_getput.m4 +++ b/src/dispatchers/var_getput.m4 @@ -56,23 +56,24 @@ define(`GOTO_CHECK',`{ DEBUG_ASSIGN_ERROR(err, $1) goto err_check; }')dnl } +#include /*----< check_EINVALCOORDS() >-----------------------------------------------*/ static -int check_EINVALCOORDS(int strict_coord_bound, +int check_EINVALCOORDS(int relax_coord_bound, MPI_Offset start, MPI_Offset count, MPI_Offset shape) { - if (strict_coord_bound) { - if (start < 0 || start >= shape) - DEBUG_RETURN_ERROR(NC_EINVALCOORDS) - } - else { + if (relax_coord_bound) { if (start < 0 || start > shape) DEBUG_RETURN_ERROR(NC_EINVALCOORDS) if (start == shape && count > 0) DEBUG_RETURN_ERROR(NC_EINVALCOORDS) } + else { + if (start < 0 || start >= shape) + DEBUG_RETURN_ERROR(NC_EINVALCOORDS) + } return NC_NOERR; } @@ -116,11 +117,11 @@ int check_EEDGE(const MPI_Offset *start, pncp->driver->inq_var(pncp->ncp, varid, name, NULL, NULL, \ NULL, NULL, NULL, NULL, NULL); \ if (stride != NULL) \ - fprintf(stderr, "Rank %d: NC_EEDGE variable %s: shape[%d]="OFFFMT" but start[%d]="OFFFMT" count[%d]="OFFFMT" 
stride[%d]="OFFFMT"\n", \ - _rank, name, dim, shape[dim], dim, start[dim], dim, count[dim], dim, stride[dim]); \ + fprintf(stderr, "Rank %d in %s (%d): NC_EEDGE variable %s: shape[%d]="OFFFMT" but start[%d]="OFFFMT" count[%d]="OFFFMT" stride[%d]="OFFFMT"\n", \ + _rank, __func__, __LINE__, name, dim, shape[dim], dim, start[dim], dim, count[dim], dim, stride[dim]); \ else \ - fprintf(stderr, "Rank %d: NC_EEDGE variable %s: shape[%d]="OFFFMT" but start[%d]="OFFFMT" count[%d]="OFFFMT"\n", \ - _rank, name, dim, shape[dim], dim, start[dim], dim, count[dim]); \ + fprintf(stderr, "Rank %d in %s (%d): NC_EEDGE variable %s: shape[%d]="OFFFMT" but start[%d]="OFFFMT" count[%d]="OFFFMT"\n", \ + _rank, __func__, __LINE__, name, dim, shape[dim], dim, start[dim], dim, count[dim]); \ } \ } \ } @@ -170,7 +171,7 @@ int check_start_count_stride(PNC *pncp, MPI_Offset len = (count == NULL) ? 1 : count[0]; if (shape[0] == 0 && len > 0) /* no record yet */ DEBUG_RETURN_ERROR(NC_EINVALCOORDS) - err = check_EINVALCOORDS(pncp->flag & NC_MODE_STRICT_COORD_BOUND, + err = check_EINVALCOORDS(fIsSet(pncp->flag, NC_MODE_STRICT_COORD_BOUND), start[0], len, shape[0]); if (err != NC_NOERR) return err; } @@ -181,7 +182,7 @@ int check_start_count_stride(PNC *pncp, ndims = pncp->vars[varid].ndims; for (i=firstDim; iflag & NC_MODE_STRICT_COORD_BOUND, + err = check_EINVALCOORDS(fIsSet(pncp->flag, NC_MODE_STRICT_COORD_BOUND), start[i], len, shape[i]); if (err != NC_NOERR) return err; } diff --git a/src/dispatchers/variable.c b/src/dispatchers/variable.c index ba8c92e16e..4d7ca9f0e3 100644 --- a/src/dispatchers/variable.c +++ b/src/dispatchers/variable.c @@ -235,6 +235,76 @@ ncmpi_def_var(int ncid, /* IN: file ID */ return NC_NOERR; } +#ifdef ENABLE_COMPRESSION +/*----< ncmpi_var_set_chunk() >----------------------------------------------------*/ +/* This is a collective subroutine. 
*/ +int ncmpi_var_set_chunk (int ncid, /* IN: file ID */ + int varid, + int *chunk_dim) +{ + int err; + int ndim; + + err = ncmpi_inq_varndims(ncid,varid, &ndim); + if (err != NC_NOERR) return err; + + return ncmpi_put_att_int(ncid, varid, "_chunkdim", NC_INT, ndim, chunk_dim); +} +/*----< ncmpi_var_get_chunk() >----------------------------------------------------*/ +int ncmpi_var_get_chunk (int ncid, /* IN: file ID */ + int varid, + int *chunk_dim) +{ + return ncmpi_get_att_int(ncid, varid, "_chunkdim", chunk_dim); +} +/*----< ncmpi_var_set_filter() >----------------------------------------------------*/ +/* This is a collective subroutine. */ +int ncmpi_var_set_filter (int ncid, /* IN: file ID */ + int varid, + int filter) +{ + return ncmpi_put_att_int(ncid, varid, "_filter", NC_INT, 1, &filter); +} +/*----< ncmpi_var_get_filter() >----------------------------------------------------*/ +int ncmpi_var_get_filter (int ncid, /* IN: file ID */ + int varid, + int *filter) +{ + return ncmpi_get_att_int(ncid, varid, "_filter", filter); +} +#else +/*----< ncmpi_var_set_chunk() >----------------------------------------------------*/ +/* This is a collective subroutine. */ +int ncmpi_var_set_chunk (int ncid, /* IN: file ID */ + int varid, + int *chunk_dim) +{ + return NC_ENOTBUILT; +} +/*----< ncmpi_var_get_chunk() >----------------------------------------------------*/ +int ncmpi_var_get_chunk (int ncid, /* IN: file ID */ + int varid, + int *chunk_dim) +{ + return NC_ENOTBUILT; +} +/*----< ncmpi_var_set_filter() >----------------------------------------------------*/ +/* This is a collective subroutine. 
*/ +int ncmpi_var_set_filter (int ncid, /* IN: file ID */ + int varid, + int filter) +{ + return NC_ENOTBUILT; +} +/*----< ncmpi_var_get_filter() >----------------------------------------------------*/ +int ncmpi_var_get_filter (int ncid, /* IN: file ID */ + int varid, + int *filter) +{ + return NC_ENOTBUILT; +} +#endif + /*----< ncmpi_def_var_fill() >-----------------------------------------------*/ /* this API is collective, and must be called in define mode */ int diff --git a/src/drivers/Makefile.am b/src/drivers/Makefile.am index 3749fcd99e..8829a41ec2 100644 --- a/src/drivers/Makefile.am +++ b/src/drivers/Makefile.am @@ -6,7 +6,7 @@ # # @configure_input@ -SUBDIRS = include common ncmpio +SUBDIRS = include common ncmpio pncio if BUILD_DRIVER_FOO SUBDIRS += ncfoo @@ -20,11 +20,15 @@ if ENABLE_BURST_BUFFER SUBDIRS += ncbbio endif +if ENABLE_CHUNKING + SUBDIRS += ncchunkio +endif + if ENABLE_ADIOS SUBDIRS += ncadios endif -DIST_SUBDIRS = include common ncmpio ncfoo ncbbio nc4io ncadios +DIST_SUBDIRS = include common ncmpio ncfoo ncbbio nc4io ncadios pncio ncchunkio # For VPATH build (parallel build), try delete all sub-directories distclean-local: diff --git a/src/drivers/common/dtype_decode.c b/src/drivers/common/dtype_decode.c index 3c4143e361..163cfabbed 100644 --- a/src/drivers/common/dtype_decode.c +++ b/src/drivers/common/dtype_decode.c @@ -299,7 +299,7 @@ int ncmpii_dtype_decode(MPI_Datatype dtype, MPI_Datatype ptype, *array_of_dtypes=NULL; MPI_Aint *array_of_adds=NULL; #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count num_ints, num_adds, num_larges, num_dtypes, *array_of_larges; + MPI_Count num_ints, num_adds, num_larges, num_dtypes, *array_of_larges=NULL; int *distribs, *dargs, *psizes; MPI_Count *gzises; #else diff --git a/src/drivers/common/mem_alloc.c b/src/drivers/common/mem_alloc.c index 279dd44b4e..7def10c6c1 100644 --- a/src/drivers/common/mem_alloc.c +++ b/src/drivers/common/mem_alloc.c @@ -9,13 +9,15 @@ NCI_Malloc(size) NCI_Calloc(nelems, esize) 
NCI_Realloc(ptr, size) + NCI_Strdup(ptr) NCI_Free(ptr) In macro.h, they are macro-replaced to - NCI_Malloc_fn(size, __LINE__, __FILE__) and - NCI_Calloc_fn(nelems, esize, __LINE__, __FILE__) and + NCI_Malloc_fn(size, __LINE__, __func__, __FILE__) + NCI_Calloc_fn(nelems, esize, __LINE__, __func__, __FILE__) NCI_Realloc_fn(ptr, size, __LINE__, __func__, __FILE__) - NCI_Free_fn(ptr,__LINE__,__FILE__). + NCI_Strdup_fn(ptr, __LINE__, __func__, __FILE__) + NCI_Free_fn(ptr, __LINE__, __func__, __FILE__). */ #ifdef HAVE_CONFIG_H diff --git a/src/drivers/common/utils.c b/src/drivers/common/utils.c index e60b6e30af..362a81ed14 100644 --- a/src/drivers/common/utils.c +++ b/src/drivers/common/utils.c @@ -61,7 +61,7 @@ ncmpii_xlen_nc_type(nc_type xtype, int *size) } } -/* File system types recognized by ROMIO in MPICH 4.0.0 */ +/* File system types recognized by ROMIO in MPICH 4.0.0, and by PnetCDF */ static const char* fstypes[] = {"ufs", "nfs", "xfs", "pvfs2", "gpfs", "panfs", "lustre", "daos", "testfs", "ime", "quobyte", NULL}; /* Return a pointer to filename by removing the file system type prefix name if @@ -91,3 +91,158 @@ char* ncmpii_remove_file_system_type_prefix(const char *filename) return ret_filename; } +/*----< ncmpii_construct_node_list() >---------------------------------------*/ +/* This subroutine is a collective call. It finds the affinity of each MPI + * process to the compute node and returns the followings: + * num_nodes_ptr Number of unique nodes (host names) + * node_ids_ptr [nprocs] node IDs of each rank, must be freed by caller. + */ +int +ncmpii_construct_node_list(MPI_Comm comm, + int *num_nodes_ptr, /* OUT: */ + int **node_ids_ptr) /* OUT: [nprocs] */ +{ + char my_procname[MPI_MAX_PROCESSOR_NAME], **all_procnames=NULL; + int i, j, k, rank, nprocs, num_nodes, my_procname_len, root=0; + int *node_ids=NULL, *all_procname_lens=NULL; + + MPI_Comm_size(comm, &nprocs); + MPI_Comm_rank(comm, &rank); + + /* Collect host name of alocated compute nodes. 
Note my_procname is null + * character terminated, but my_procname_len does not include the null + * character. + */ + MPI_Get_processor_name(my_procname, &my_procname_len); +#if 0 +#ifdef MIMIC_LUSTRE +#define MIMIC_NUM_NODES 1 + /* mimic number of compute nodes = MIMIC_NUM_NODES */ + int node_id, np_per_node = nprocs / MIMIC_NUM_NODES; + if (nprocs % MIMIC_NUM_NODES > 0) np_per_node++; + if (rank < np_per_node * (nprocs % MIMIC_NUM_NODES)) + node_id = rank / np_per_node; + else + node_id = (rank - np_per_node * (nprocs % MIMIC_NUM_NODES)) / (nprocs / MIMIC_NUM_NODES) + (nprocs % MIMIC_NUM_NODES); + + sprintf(my_procname,"compute.node.%d", node_id); + my_procname_len = (int)strlen(my_procname); +#endif +#endif + + my_procname_len++; /* to include terminate null character */ + + if (rank == root) { + /* root collects all procnames */ + all_procnames = (char **) NCI_Malloc(sizeof(char*) * nprocs); + if (all_procnames == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + + all_procname_lens = (int *) NCI_Malloc(sizeof(int) * nprocs); + if (all_procname_lens == NULL) { + NCI_Free(all_procnames); + DEBUG_RETURN_ERROR(NC_ENOMEM) + } + } + /* gather process name lengths from all processes first */ + MPI_Gather(&my_procname_len, 1, MPI_INT, all_procname_lens, 1, MPI_INT, + root, comm); + + if (rank == root) { + int *disp; + size_t alloc_size = 0; + + for (i=0; i int -nc4io_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - MPI_Info info, - void **ncpp) /* OUT */ +nc4io_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { char *filename; int err, ncidtmp; @@ -108,12 +110,14 @@ nc4io_create(MPI_Comm comm, } int -nc4io_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - MPI_Info info, - void **ncpp) +nc4io_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids 
node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { char *filename; int err, ncidtmp; diff --git a/src/drivers/ncadios/ncadios_bp2ncd.c b/src/drivers/ncadios/ncadios_bp2ncd.c index d985ec093f..49c3f6eced 100644 --- a/src/drivers/ncadios/ncadios_bp2ncd.c +++ b/src/drivers/ncadios/ncadios_bp2ncd.c @@ -525,7 +525,7 @@ int ncadiosi_parse_header_bp2ncd (NC_ad *ncid) pg = pg_root; while (pg) { - int i,j; + int j; int var_dims_count = 0; struct var_dim * var_dims = 0; diff --git a/src/drivers/ncadios/ncadios_driver.h b/src/drivers/ncadios/ncadios_driver.h index b8aff5f757..28432d1e15 100644 --- a/src/drivers/ncadios/ncadios_driver.h +++ b/src/drivers/ncadios/ncadios_driver.h @@ -91,10 +91,14 @@ struct NC_ad { }; extern int -ncadios_create(MPI_Comm comm, const char *path, int cmode, int ncid, MPI_Info info, void **ncdp); +ncadios_create(MPI_Comm comm, const char *path, int cmode, int ncid, + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int -ncadios_open(MPI_Comm comm, const char *path, int omode, int ncid, MPI_Info info, void **ncdp); +ncadios_open(MPI_Comm comm, const char *path, int omode, int ncid, + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncadios_close(void *ncdp); diff --git a/src/drivers/ncadios/ncadios_file.c b/src/drivers/ncadios/ncadios_file.c index bede8e461d..5ab07ec0c5 100644 --- a/src/drivers/ncadios/ncadios_file.c +++ b/src/drivers/ncadios/ncadios_file.c @@ -47,24 +47,28 @@ #include int -ncadios_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - MPI_Info info, - void **ncpp) /* OUT */ +ncadios_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { /* Read only driver */ DEBUG_RETURN_ERROR(NC_ENOTSUPPORT); } int -ncadios_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - MPI_Info info, - void **ncpp) 
+ncadios_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err, parse_done=0; NC_ad *ncadp; diff --git a/src/drivers/ncadios/ncadios_lists.c b/src/drivers/ncadios/ncadios_lists.c index 3bf4e29cb4..3618ab0994 100644 --- a/src/drivers/ncadios/ncadios_lists.c +++ b/src/drivers/ncadios/ncadios_lists.c @@ -73,7 +73,7 @@ int ncadiosi_var_list_add(NC_ad_var_list *list, NC_ad_var data) { if (list->nalloc == 0){ list->nalloc = 16; - list->data = NCI_Malloc(sizeof(NC_ad_varp) * list->nalloc); + list->data = NCI_Malloc(sizeof(NC_ad_var) * list->nalloc); } else if (list->nalloc == id){ list->nalloc *= 2; diff --git a/src/drivers/ncbbio/ncbbio_driver.h b/src/drivers/ncbbio/ncbbio_driver.h index b585d2e05c..6c60b8042e 100644 --- a/src/drivers/ncbbio/ncbbio_driver.h +++ b/src/drivers/ncbbio/ncbbio_driver.h @@ -259,11 +259,15 @@ int ncbbio_sharedfile_seek (NC_bb_sharedfile *f, off_t offset, int whence); void ncbbio_extract_hint (NC_bb *ncbbp, MPI_Info info); void ncbbio_export_hint (NC_bb *ncbbp, MPI_Info *info); -extern int ncbbio_create ( - MPI_Comm comm, const char *path, int cmode, int ncid, MPI_Info info, void **ncdp); - -extern int ncbbio_open ( - MPI_Comm comm, const char *path, int omode, int ncid, MPI_Info info, void **ncdp); +extern +int ncbbio_create(MPI_Comm comm, const char *path, int cmode, int ncid, + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); + +extern +int ncbbio_open(MPI_Comm comm, const char *path, int omode, int ncid, + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncbbio_close (void *ncdp); diff --git a/src/drivers/ncbbio/ncbbio_file.c b/src/drivers/ncbbio/ncbbio_file.c index 271f1b969d..76f18b1f21 100644 --- a/src/drivers/ncbbio/ncbbio_file.c +++ b/src/drivers/ncbbio/ncbbio_file.c @@ -49,12 +49,14 @@ #include int -ncbbio_create(MPI_Comm comm, - const char *path, 
- int cmode, - int ncid, - MPI_Info info, - void **ncpp) /* OUT */ +ncbbio_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err; void *ncp=NULL; @@ -65,7 +67,8 @@ ncbbio_create(MPI_Comm comm, driver = ncmpio_inq_driver(); if (driver == NULL) DEBUG_RETURN_ERROR(NC_ENOTNC) - err = driver->create(comm, path, cmode, ncid, info, &ncp); + err = driver->create(comm, path, cmode, ncid, env_mode, info, node_ids, + &ncp); if (err != NC_NOERR) return err; /* Create a NC_bb object and save its driver pointer */ @@ -104,12 +107,14 @@ ncbbio_create(MPI_Comm comm, } int -ncbbio_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - MPI_Info info, - void **ncpp) +ncbbio_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err; void *ncp=NULL; @@ -119,7 +124,7 @@ ncbbio_open(MPI_Comm comm, driver = ncmpio_inq_driver(); if (driver == NULL) DEBUG_RETURN_ERROR(NC_ENOTNC) - err = driver->open(comm, path, omode, ncid, info, &ncp); + err = driver->open(comm, path, omode, ncid, env_mode, info, node_ids, &ncp); if (err != NC_NOERR) return err; /* Create a NC_bb object and save its driver pointer */ @@ -375,6 +380,11 @@ ncbbio_begin_indep_data(void *ncdp) int err; NC_bb *ncbbp = (NC_bb*)ncdp; + err = ncbbio_sync(ncdp); + if (err != NC_NOERR) return err; + + MPI_Barrier(ncbbp->comm); + err = ncbbp->ncmpio_driver->begin_indep_data(ncbbp->ncp); if (err != NC_NOERR) return err; diff --git a/src/drivers/ncbbio/ncbbio_log.c b/src/drivers/ncbbio/ncbbio_log.c index 606f602568..70d78d3d64 100644 --- a/src/drivers/ncbbio/ncbbio_log.c +++ b/src/drivers/ncbbio/ncbbio_log.c @@ -16,6 +16,8 @@ #include /* opendir() closedir() */ #include #include +#include /* dirname() */ + #include #include #include @@ -29,10 +31,9 @@ int 
ncbbio_log_create(NC_bb* ncbbp, __attribute__((unused)) MPI_Info info) { int rank, np, err, flag, masterrank, procname_len; - char logbase[NC_LOG_MAX_PATH], basename[NC_LOG_MAX_PATH]; + char *logbase=NULL, *basename=NULL; char procname[MPI_MAX_PROCESSOR_NAME]; - char *abspath, *fname, *path, *fdir = NULL; - char *logbasep = "."; + char *fname, *path, *fdir = NULL, *logbasep; #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) double t1, t2; #endif @@ -68,7 +69,7 @@ int ncbbio_log_create(NC_bb* ncbbp, /* Determine log file name * Log file name is $(bufferdir)$(basename)_$(ncid)_$(rank).{meta/data} * filepath is absolute path to the cdf file - * If buffer directory is not set, we use the same directory as the NetCDF file + * If buffer directory is not set, we use the same directory as the file */ /* Read environment variable for burst buffer path */ @@ -86,21 +87,9 @@ int ncbbio_log_create(NC_bb* ncbbp, logbasep = ncmpii_remove_file_system_type_prefix(ncbbp->logbase); } else { - size_t i = strlen(path); - fdir = (char*)NCI_Malloc(sizeof(char) * (i + 1)); - strncpy(fdir, path, i + 1); - /* Search for first '\' from the back */ - for (i--; i > -1; i--) { - if (fdir[i] == '/') { - fdir[i + 1] = '\0'; - break; - } - } - - /* If directory is fund, use it as logbase */ - if (i >= 0) { - logbasep = fdir; - } + fdir = strdup(path); + logbasep = dirname(fdir); + if (logbasep == NULL) logbasep = "."; /* Warn if log base not set by user */ if (rank == 0) { @@ -121,13 +110,13 @@ int ncbbio_log_create(NC_bb* ncbbp, closedir(logdir); /* Resolve absolute path */ - abspath = realpath(path, basename); - if (abspath == NULL) { + basename = realpath(path, NULL); + if (basename == NULL) { /* Can not resolve absolute path */ DEBUG_RETURN_ERROR(NC_EBAD_FILE); } - abspath = realpath(logbasep, logbase); - if (abspath == NULL) { + logbase = realpath(logbasep, NULL); + if (logbase == NULL) { /* Can not resolve absolute path */ DEBUG_RETURN_ERROR(NC_EBAD_FILE); } @@ -305,6 +294,9 @@ int 
ncbbio_log_create(NC_bb* ncbbp, ncbbp->total_data += 8; #endif + if (basename != NULL) free(basename); + if (logbase != NULL) free(logbase); + return NC_NOERR; } diff --git a/src/drivers/ncchunkio/DEVELOPER_NOTES.md b/src/drivers/ncchunkio/DEVELOPER_NOTES.md new file mode 100644 index 0000000000..99ee038f61 --- /dev/null +++ b/src/drivers/ncchunkio/DEVELOPER_NOTES.md @@ -0,0 +1,74 @@ +# Note for Developers + +### Table of contents +- [Future Work] +- [Internal global attributes] +- [Anchor variable (one per variable with chunking enabled)] +- [Reference table] +- [Chunks] +- [Requirement for compressed variables] + +--- + +## Internal global attributes: + * Number of chunked variables + +## Anchor variable (one per variable with chunking enabled): + * A scalar variable + * Data type is the same as user defined + * Internal attributes + + Dimension IDs are saved as an attribute of an array of integer type + + Number of dimensions is saved as an internal attribute + + An attribute to tell whether it is a fixed-size or record variable + + An attribute offset pointer to reference table + * For fixed-size variable, it is a scalar + * For record variable, it is an array of 8-type integers, one for each record + * This array can be allocated in multiple of 16 for example + * Need an integer for allocated size, e.g. multiple of 16 + * Need an integer for size (true number of records written) + + An attributes for chunk sizes, an integer array + + An attributes for compression algorithm + + An attributes for compression level + * If a variable missing these internal attributes, it is a traditional variable + +## Reference table: + * An array stores offsets of individual chunks + * Not a NetCDF variable. 
But we use the CDF5 format specification to define it + + TODO: give it a formal spec in BNF grammar + * For a fixed-size variable, it is a 1D array of size equal to the number of chunks + * This table is loaded into memory when calling ncmpi_inq_varid + * For blocking API, it is sync-ed and written to file by root + + TODO: in future, it can be written by multiple ranks in parallel + * For nonblocking API, multiple tables are written by multiple ranks in parallel + +## Chunks: + * Chunks are not NetCDF variables + + TODO: give it a formal spec in BNF grammar? + * Chunks are stored in space between NetCDF variables, i.e. padding areas in files + * Data is type-converted and byte-swapped before compression + * In principle, chunks should be stored in file contiguously with each other, + for all variables. But they are not required to be stored contiguously. + * The storage order of chunks is row-major + +## Requirement for compressed variables: + * Collective I/O only (the same requirement as in HDF5) + * Must be chunked (same as HDF5) + + +## Future Work +* Reuse metadata across variables + - Variables from the same simulation space may have the same access pattern. + - Instead of generating variable metadata and index table separately, we can + share information across variables. + - Chunk size and chunk ownership info can be reused. +* Data sieving + - When rewriting to a chunk, we don't need to read the background if it is + fully overwritten. + - Need an efficient way to determine whether a chunk is fully rewritten. + - It may be infeasible due to communication and computation cost. + - HDF5 approximates this by checking whether the owner fully rewrites the chunk. +* Reuse metadata across records + - I/O patterns across time steps are likely the same. + - If we detect the same I/O pattern as the previous record, we can skip sending the metadata. + - MPI datatype created for previous timestep can also be reused.
+--- diff --git a/src/drivers/ncchunkio/Makefile.am b/src/drivers/ncchunkio/Makefile.am new file mode 100644 index 0000000000..34ce50d763 --- /dev/null +++ b/src/drivers/ncchunkio/Makefile.am @@ -0,0 +1,95 @@ +# +# Copyright (C) 2012, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. +# +# $Id: Makefile.am 3283 2017-07-30 21:10:11Z wkliao $ +# +# @configure_input@ + +SUFFIXES = .a .o .c .m4 .h + +AM_CPPFLAGS = -I${top_srcdir}/src/include +AM_CPPFLAGS += -I${top_builddir}/src/include +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/include +AM_CPPFLAGS += -I${top_builddir}/src/drivers/include +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/ncmpio +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/pncio + +if PNETCDF_DEBUG + AM_CPPFLAGS += -DPNETCDF_DEBUG +endif + +noinst_LTLIBRARIES = libncchkio.la + +M4FLAGS += -I${top_srcdir}/m4 +M4FLAGS += -I${top_srcdir}/src/drivers/ncchunkio +if ENABLE_ERANGE_FILL +M4FLAGS += -DERANGE_FILL +endif + +M4_SRCS = ncchkioi_profile.m4 \ + ncchkioi_convert.m4 + +M4H_SRCS = ncchkioi_profile.m4h + +H_SRCS = ncchkio_driver.h + +C_SRCS = ncchkio_attr.c \ + ncchkio_dim.c \ + ncchkio_driver.c \ + ncchkio_file.c \ + ncchkio_var.c \ + ncchkio_internal.c \ + ncchkioi_util.c \ + ncchkioi_put_var.c \ + ncchkioi_get_var.c \ + ncchkioi_put_varn.c \ + ncchkioi_get_varn.c \ + ncchkioi_iput_cb.c \ + ncchkioi_iget_cb.c \ + ncchkioi_iput.c \ + ncchkioi_iget.c \ + ncchkioi_nonblocking.c \ + ncchkioi_cache.c \ + ncchkioi_chunk.c \ + ncchkioi_chunk_size.c \ + ncchkioi_chunk_owner.c \ + ncchkioi_var_init.c \ + ncchkioi_var_resize.c \ + ncchkioi_var_wr.c \ + ncchkioi_var_rd.c \ + ncchkioi_lists.c \ + ncchkioi_wait.c \ + ncchk_filter_dummy.c + +if ENABLE_ZLIB + C_SRCS += ncchk_filter_zlib.c +endif + +if ENABLE_SZ + C_SRCS += ncchk_filter_sz.c +endif + +$(M4_SRCS:.m4=.c): Makefile +$(M4H_SRCS:.m4h=.h): Makefile + +.m4.c: + $(M4) $(AM_M4FLAGS) $(M4FLAGS) $< >$@ + +.m4h.h: + $(M4) $(AM_M4FLAGS) $(M4FLAGS) $< >$@ + 
+libncchkio_la_SOURCES = $(C_SRCS) $(H_SRCS) +nodist_libncchkio_la_SOURCES = $(M4_SRCS:.m4=.c) $(M4H_SRCS:.m4h=.h) + +# automake says "... BUILT_SOURCES is honored only by 'make all', 'make check', +# and 'make install'. This means you cannot build a specific target (e.g., +# 'make target') in a clean tree if it depends on a built source." +BUILT_SOURCES = $(M4_SRCS:.m4=.c) $(M4H_SRCS:.m4h=.h) + +CLEANFILES = $(M4_SRCS:.m4=.c) $(M4H_SRCS:.m4h=.h) core core.* *.gcda *.gcno *.gcov gmon.out + +EXTRA_DIST = $(M4_HFILES) $(M4_SRCS) $(M4H_SRCS) ncchkioi_profile_timers.m4 + +tests-local: all + diff --git a/src/drivers/ncchunkio/ncchk_filter_driver.h b/src/drivers/ncchunkio/ncchk_filter_driver.h new file mode 100644 index 0000000000..b406587fca --- /dev/null +++ b/src/drivers/ncchunkio/ncchk_filter_driver.h @@ -0,0 +1,29 @@ +#ifndef NCCHK_FILTER_DRIVER_H +#define NCCHK_FILTER_DRIVER_H + +#include + +struct NCCHK_filter { + int (*init)(MPI_Info); + int (*finalize)(); + int (*inq_cpsize)(void*, int, int*, int, int*, MPI_Datatype); + int (*compress)(void*, int, void*, int*, int, int*, MPI_Datatype); + int (*compress_alloc)(void*, int, void**, int*, int, int*, MPI_Datatype); + int (*inq_dcsize)(void*, int, int*, int, int*, MPI_Datatype); + int (*decompress)(void*, int, void*, int*, int, int*, MPI_Datatype); + int (*decompress_alloc)(void*, int, void**, int*, int, int*, MPI_Datatype); +}; + +typedef struct NCCHK_filter NCCHK_filter; + +extern NCCHK_filter* ncchk_dummy_inq_driver(void); + +#if ENABLE_ZLIB +extern NCCHK_filter* ncchk_zlib_inq_driver(void); +#endif + +#if ENABLE_SZ +extern NCCHK_filter* ncchk_sz_inq_driver(void); +#endif + +#endif \ No newline at end of file diff --git a/src/drivers/ncchunkio/ncchk_filter_dummy.c b/src/drivers/ncchunkio/ncchk_filter_dummy.c new file mode 100644 index 0000000000..59e7b87626 --- /dev/null +++ b/src/drivers/ncchunkio/ncchk_filter_dummy.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National 
Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include + +int ncchk_dummy_init(MPI_Info info) { /* nothing to set up; info is not used */ + return NC_NOERR; +} + +int ncchk_dummy_finalize() { /* nothing to release */ + return NC_NOERR; +} + +/* Return an estimated compressed data size + * Actual compressed size should not exceed the estimation. + * The dummy filter copies data unchanged, so the estimate written to *out_len is exactly in_len + */ +int ncchk_dummy_inq_cpsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + *out_len = in_len; + return NC_NOERR; +} + +/* If out_len is large enough, compress the data at in and save it to out. out_len is set to actual compressed data size + * If out_len is NULL, we assume out is large enough for compressed data. Fails with NC_ENOMEM when *out_len is smaller than in_len + */ +int ncchk_dummy_compress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + if (out_len != NULL){ + // Check output buffer size + if ((*out_len) < in_len){ + DEBUG_RETURN_ERROR(NC_ENOMEM); + } + + // Overwrite output buffer size with actual size + *out_len = in_len; + } + + // Copy data directly as dummy compression (ndim, dims and dtype are not used) + memcpy(out, in, in_len); + + return NC_NOERR; +} + +/* Compress the data at in and save it to a newly allocated buffer at out.
out_len is set to actual compressed data size + * The caller is responsible for freeing the buffer with free(). Returns NC_ENOMEM if the buffer cannot be allocated + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_dummy_compress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + // Allocate output buffer + *out = (void*)malloc(in_len); + if (*out == NULL && in_len > 0) DEBUG_RETURN_ERROR(NC_ENOMEM); // allocation failed; a NULL result for a zero-size request is not an error + // Buffer size + if (out_len != NULL) { + *out_len = in_len; + } + + // Copy data directly as dummy compression + memcpy(*out, in, in_len); + + return NC_NOERR; +} + +/* Return an estimated decompressed data size + * Actual decompressed size should not exceed the estimation + */ +int ncchk_dummy_inq_dcsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + *out_len = in_len; + return NC_NOERR; +} + +/* If out_len is large enough, decompress the data at in and save it to out. out_len is set to actual decompressed size + * If out_len is NULL, we assume out is large enough for decompressed data + */ +int ncchk_dummy_decompress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + if (out_len != NULL){ + // Check output buffer size + if ((*out_len) < in_len){ + DEBUG_RETURN_ERROR(NC_ENOMEM); + } + + // Overwrite output buffer size with actual size + *out_len = in_len; + } + + // Copy data directly as dummy decompression + memcpy(out, in, in_len); + + return NC_NOERR; +} + +/* Decompress the data at in and save it to a newly allocated buffer at out.
out_len is set to actual decompressed data size + * The caller is responsible for freeing the buffer with free(). Returns NC_ENOMEM if the buffer cannot be allocated + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_dummy_decompress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + // Allocate output buffer + *out = (void*)malloc(in_len); + if (*out == NULL && in_len > 0) DEBUG_RETURN_ERROR(NC_ENOMEM); // allocation failed; a NULL result for a zero-size request is not an error + // Buffer size + if (out_len != NULL) { + *out_len = in_len; + } + + // Copy data directly as dummy decompression + memcpy(*out, in, in_len); + + return NC_NOERR; +} + +static NCCHK_filter ncchkio_driver = { + ncchk_dummy_init, + ncchk_dummy_finalize, + ncchk_dummy_inq_cpsize, + ncchk_dummy_compress, + ncchk_dummy_compress_alloc, + ncchk_dummy_inq_dcsize, + ncchk_dummy_decompress, + ncchk_dummy_decompress_alloc +}; + +NCCHK_filter* ncchk_dummy_inq_driver(void) { + return &ncchkio_driver; +} + diff --git a/src/drivers/ncchunkio/ncchk_filter_sz.c b/src/drivers/ncchunkio/ncchk_filter_sz.c new file mode 100644 index 0000000000..dc5ac9aa12 --- /dev/null +++ b/src/drivers/ncchunkio/ncchk_filter_sz.c @@ -0,0 +1,312 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory.
+ */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +static int mpi_to_sz_type(MPI_Datatype dtype){ + if (dtype == MPI_FLOAT){ + return SZ_FLOAT; + } + else if (dtype == MPI_DOUBLE){ + return SZ_DOUBLE; + } + else if (dtype == MPI_BYTE){ + return SZ_UINT8; + } + else if (dtype == MPI_CHAR){ + return SZ_INT8; + } + else if (dtype == MPI_SHORT){ + return SZ_INT16; + } + else if (dtype == MPI_UNSIGNED_SHORT){ + return SZ_UINT16; + } + else if (dtype == MPI_INT){ + return SZ_INT32; + } + else if (dtype == MPI_UNSIGNED){ + return SZ_UINT32; + } + else if (dtype == MPI_LONG_LONG){ + return SZ_INT64; + } + else if (dtype == MPI_UNSIGNED_LONG_LONG){ + return SZ_UINT64; + } + + return -1; +} + +int ncchk_sz_init(MPI_Info info) { + sz_params sz; + + memset(&sz, 0, sizeof(sz_params)); + sz.sol_ID = SZ; + sz.sampleDistance = 50; + sz.quantization_intervals = 0; + sz.max_quant_intervals = 65536; + sz.predThreshold = 0.98; + sz.szMode = SZ_BEST_COMPRESSION; + sz.losslessCompressor = ZSTD_COMPRESSOR; + sz.gzipMode = 1; + sz.errorBoundMode = ABS; + sz.absErrBound = 1E-3; + sz.relBoundRatio = 1E-5; + SZ_Init_Params(&sz); + + return NC_NOERR; +} + +int ncchk_sz_finalize() { + SZ_Finalize(); + + return NC_NOERR; +} + +/* Return an estimated compressed data size + * Actual compressed size should not exceed the estimation + */ +int ncchk_sz_inq_cpsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + return NC_ENOTSUPPORT; // sz has no size estimation +} + +/* If out_len is large enough, compress the data at in and save it to out. 
out_len is set to actual compressed data size + * If out_len is NULL, we assume out is large enough for compressed data + */ +int ncchk_sz_compress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int i; + int szdtype; + size_t r[4]; + size_t outsize; + void *buf = NULL; + + szdtype = mpi_to_sz_type(dtype); + if (szdtype < 0){ + DEBUG_ASSIGN_ERROR(err, NC_EINVAL) + goto out; + } + + for(i = 0; i < 4; i++){ + if (i < ndim){ + r[i] = dims[i]; + } + else{ + r[i] = 0; + } + } + for(i = 4; i < ndim; i++){ + r[3] *= dims[i]; + } + + buf = SZ_compress(szdtype, in, &outsize, 0, r[3], r[2], r[1], r[0]); + + if (out_len != NULL){ + // If buffer not large enough + if (*out_len < outsize){ + DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) + goto out; + } + + // Size of comrpessed data + *out_len = outsize; + } + + memcpy(out, buf, outsize); + +out: + if (buf != NULL){ + free(buf); + } + + return err; +} + +/* Compress the data at in and save it to a newly allocated buffer at out. 
out_len is set to actual compressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_sz_compress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int i; + int szdtype; + size_t r[4]; + size_t outsize; + void *buf = NULL; + + szdtype = mpi_to_sz_type(dtype); + if (szdtype < 0){ + DEBUG_ASSIGN_ERROR(err, NC_EINVAL) + goto out; + } + + for(i = 0; i < 4; i++){ + if (i < ndim){ + r[i] = dims[i]; + } + else{ + r[i] = 0; + } + } + for(i = 4; i < ndim; i++){ + r[3] *= dims[i]; + } + + *out = SZ_compress(szdtype, in, &outsize, 0, r[3], r[2], r[1], r[0]); + + if (out_len != NULL){ + // Size of comrpessed data + *out_len = outsize; + } + +out: + if (buf != NULL){ + free(buf); + } + + return err; +} + +/* Return an estimated decompressed data size + * Actual decompressed size should not exceed the estimation + */ +int ncchk_sz_inq_dcsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + return NC_ENOTSUPPORT; // sz has no size estimation +} + +/* If out_len is large enough, decompress the data at in and save it to out. 
out_len is set to actual decompressed size + * If out_len is NULL, we assume out is large enough for decompressed data + */ +int ncchk_sz_decompress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int i; + size_t r[4]; + int szdtype; + int outsize; + void *buf = NULL; + + szdtype = mpi_to_sz_type(dtype); + if (szdtype < 0){ + DEBUG_ASSIGN_ERROR(err, NC_EINVAL) + goto out; + } + + MPI_Type_size(dtype, &outsize); + for(i = 0; i < 4; i++){ + if (i < ndim){ + r[i] = dims[i]; + outsize *= dims[i]; + } + else{ + r[i] = 0; + } + } + for(i = 4; i < ndim; i++){ + r[3] *= dims[i]; + outsize *= dims[i]; + } + + buf = SZ_decompress(szdtype, in, (size_t)in_len, 0, r[3], r[2], r[1], r[0]); + + if (out_len != NULL){ + // If buffer not large enough + if (*out_len < outsize){ + DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) + goto out; + } + + // Size of comrpessed data + *out_len = outsize; + } + + memcpy(out, buf, outsize); + +out: + if (buf != NULL){ + free(buf); + } + + return err; +} + +/* Decompress the data at in and save it to a newly allocated buffer at out. 
out_len is set to actual decompressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_sz_decompress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int i; + size_t r[4]; + int szdtype; + int outsize; + void *buf = NULL; + + szdtype = mpi_to_sz_type(dtype); + if (szdtype < 0){ + DEBUG_ASSIGN_ERROR(err, NC_EINVAL) + goto out; + } + + MPI_Type_size(dtype, &outsize); + for(i = 0; i < 4; i++){ + if (i < ndim){ + r[i] = dims[i]; + outsize *= dims[i]; + } + else{ + r[i] = 0; + } + } + for(i = 4; i < ndim; i++){ + r[3] *= dims[i]; + outsize *= dims[i]; + } + + *out = SZ_decompress(szdtype, in, (size_t)in_len, 0, r[3], r[2], r[1], r[0]); + + if (out_len != NULL){ + // Size of comrpessed data + *out_len = outsize; + } + +out: + if (buf != NULL){ + free(buf); + } + + return err; +} + +static NCCHK_filter ncchk_driver_sz = { + ncchk_sz_init, + ncchk_sz_finalize, + ncchk_sz_inq_cpsize, + ncchk_sz_compress, + ncchk_sz_compress_alloc, + ncchk_sz_inq_dcsize, + ncchk_sz_decompress, + ncchk_sz_decompress_alloc +}; + +NCCHK_filter* ncchk_sz_inq_driver(void) { + return &ncchk_driver_sz; +} + diff --git a/src/drivers/ncchunkio/ncchk_filter_zlib.c b/src/drivers/ncchunkio/ncchk_filter_zlib.c new file mode 100644 index 0000000000..3db4094b78 --- /dev/null +++ b/src/drivers/ncchunkio/ncchk_filter_zlib.c @@ -0,0 +1,311 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +int ncchk_zlib_init(MPI_Info info) { + return NC_NOERR; +} + +int ncchk_zlib_finalize() { + return NC_NOERR; +} + +/* Return an estimated compressed data size + * Actual compressed size should not exceed the estimation + */ +int ncchk_zlib_inq_cpsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + return NC_ENOTSUPPORT; // Zlib has no size estimation +} + +/* If out_len is large enough, compress the data at in and save it to out. out_len is set to actual compressed data size + * If out_len is NULL, we assume out is large enough for compressed data + */ +int ncchk_zlib_compress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + + // zlib struct + z_stream defstream; + defstream.zalloc = Z_NULL; + defstream.zfree = Z_NULL; + defstream.opaque = Z_NULL; + defstream.avail_in = (uInt)(in_len); // input size + defstream.next_in = (Bytef*)in; // input + if (out_len != NULL){ + defstream.avail_out = (uInt)(*out_len); // output buffer size + } + else{ + defstream.avail_out = (uInt)1000000000; // Assume it is large enough + } + defstream.next_out = (Bytef *)out; // output buffer + + // the actual compression work. 
+ err = deflateInit(&defstream, Z_DEFAULT_COMPRESSION); + if (err != Z_OK){ + printf("deflateInit fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + err = deflate(&defstream, Z_FINISH); + if (err != Z_STREAM_END){ + printf("deflate fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + err = deflateEnd(&defstream); + if (err != Z_OK){ + printf("deflateEnd fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // If buffer not large enough + if (defstream.avail_in > 0){ + DEBUG_RETURN_ERROR(NC_ENOMEM) + } + + // Size of comrpessed data + if (out_len != NULL){ + *out_len = defstream.total_out; + } + + return NC_NOERR; +} + +/* Compress the data at in and save it to a newly allocated buffer at out. out_len is set to actual compressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_zlib_compress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int bsize; // Start by 1/8 of the in_len + char *buf; + + bsize = in_len >> 3; + if (bsize < 6){ + bsize = 6; + } + buf = (char*)malloc(bsize); + + // zlib struct + z_stream defstream; + defstream.zalloc = Z_NULL; + defstream.zfree = Z_NULL; + defstream.opaque = Z_NULL; + defstream.avail_in = (uInt)(in_len); // input size + defstream.next_in = (Bytef*)in; // input + defstream.avail_out = (uInt)(bsize); // output buffer size + defstream.next_out = (Bytef *)buf; // output buffer + + // Initialize deflat stream + err = deflateInit(&defstream, Z_DEFAULT_COMPRESSION); + if (err != Z_OK){ + printf("deflateInit fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // The actual compression work + err = Z_OK; + while (err != Z_STREAM_END){ + // Compress data + err = deflate(&defstream, Z_NO_FLUSH | Z_FINISH); + // Check if buffer is lage enough + if (err != Z_STREAM_END){ + // Enlarge buffer + 
buf = (char*)realloc(buf, bsize << 1); + + // Reset buffer info in stream + defstream.next_out = (Bytef *)(buf + bsize); + defstream.avail_out = bsize; + + // Reocrd new buffer size + bsize <<= 1; + } + } + + // Finalize deflat stream + err = deflateEnd(&defstream); + if (err != Z_OK){ + printf("deflateEnd fail: %d: %s\n", err, defstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // Size of comrpessed data + if (out_len != NULL){ + *out_len = defstream.total_out; + + char *env_str; + if ((env_str = getenv("PNETCDF_COMPRESS_VERBOSE")) != NULL) { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + printf("rank %d (%s at %d) compress data size %d into size %d\n", + rank,__func__,__LINE__,in_len,*out_len); + } + } + + // Compressed data + *out = buf; + + return NC_NOERR; +} + +/* Return an estimated decompressed data size + * Actual decompressed size should not exceed the estimation + */ +int ncchk_zlib_inq_dcsize(void *in, int in_len, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + return NC_ENOTSUPPORT; // Zlib has no size estimation +} + +/* If out_len is large enough, decompress the data at in and save it to out. out_len is set to actual decompressed size + * If out_len is NULL, we assume out is large enough for decompressed data + */ +int ncchk_zlib_decompress(void *in, int in_len, void *out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + + // zlib struct + z_stream infstream; + infstream.zalloc = Z_NULL; + infstream.zfree = Z_NULL; + infstream.opaque = Z_NULL; + infstream.avail_in = (unsigned long) in_len; // input size + infstream.next_in = (Bytef *)in; // input + if (out_len != NULL){ + infstream.avail_out = (uInt)(*out_len); // output buffer size + } + else{ + infstream.avail_out = (uInt)1000000000; // Assume it is large enough + } + infstream.next_out = (Bytef *)out; // buffer size + + // the actual decompression work. 
+ err = inflateInit(&infstream); + if (err != Z_OK){ + printf("inflateInit fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + err = inflate(&infstream, Z_FINISH); + if (err != Z_STREAM_END){ + printf("inflate fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + err = inflateEnd(&infstream); + if (err != Z_OK){ + printf("inflateEnd fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // If buffer not large enough + if (infstream.avail_in > 0){ + DEBUG_RETURN_ERROR(NC_ENOMEM) + } + + // Size of decomrpessed data + if (out_len != NULL){ + *out_len = infstream.total_out; + + char *env_str; + if ((env_str = getenv("PNETCDF_COMPRESS_VERBOSE")) != NULL) { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD,&rank); + printf("rank %d (%s at %d) decompress data size %d into size %d\n", + rank,__func__,__LINE__,in_len,*out_len); + } + } + + return NC_NOERR; +} + +/* Decompress the data at in and save it to a newly allocated buffer at out. out_len is set to actual decompressed data size + * The caller is responsible to free the buffer + * If out_len is not NULL, it will be set to buffer size allocated + */ +int ncchk_zlib_decompress_alloc(void *in, int in_len, void **out, int *out_len, int ndim, int *dims, MPI_Datatype dtype) { + int err=NC_NOERR; + int bsize = in_len << 1; // Start by 2 times of the in_len + char *buf; + + buf = (char*)malloc(bsize); + + // zlib struct + z_stream infstream; + infstream.zalloc = Z_NULL; + infstream.zfree = Z_NULL; + infstream.opaque = Z_NULL; + infstream.avail_in = (uInt)(in_len); // input size + infstream.next_in = (Bytef*)in; // input + infstream.avail_out = (uInt)(bsize); // output buffer size + infstream.next_out = (Bytef *)buf; // output buffer + + // Initialize deflat stream + err = inflateInit(&infstream); + if (err != Z_OK){ + printf("inflateInit fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // The actual decompression work + err = Z_OK; + while (err != 
Z_STREAM_END){ + // Compress data + err = inflate(&infstream, Z_NO_FLUSH | Z_FINISH); + // Check if buffer is lage enough + if (err != Z_STREAM_END){ + // Enlarge buffer + buf = (char*)realloc(buf, bsize << 1); + + // Reset buffer info in stream + infstream.next_out = (Bytef *)(buf + bsize); + infstream.avail_out = bsize; + + // Reocrd new buffer size + bsize <<= 1; + } + } + + // Finalize deflat stream + err = inflateEnd(&infstream); + if (err != Z_OK){ + printf("inflateEnd fail: %d: %s\n", err, infstream.msg); + DEBUG_RETURN_ERROR(NC_EIO) + } + + // Size of comrpessed data + if (out_len != NULL){ + *out_len = infstream.total_out; + } + + // Compressed data + *out = buf; + + return NC_NOERR; +} + +static NCCHK_filter ncchk_driver_zlib = { + ncchk_zlib_init, + ncchk_zlib_finalize, + ncchk_zlib_inq_cpsize, + ncchk_zlib_compress, + ncchk_zlib_compress_alloc, + ncchk_zlib_inq_dcsize, + ncchk_zlib_decompress, + ncchk_zlib_decompress_alloc +}; + +NCCHK_filter* ncchk_zlib_inq_driver(void) { + return &ncchk_driver_zlib; +} + diff --git a/src/drivers/ncchunkio/ncchkio_attr.c b/src/drivers/ncchunkio/ncchkio_attr.c new file mode 100644 index 0000000000..90cb5196a5 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_attr.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_inq_attname() : dispatcher->inq_attname() + * ncmpi_inq_attid() : dispatcher->inq_attid() + * ncmpi_inq_att() : dispatcher->inq_att() + * ncmpi_rename_att() : dispatcher->inq_rename_att() + * ncmpi_copy_att() : dispatcher->inq_copy_att() + * ncmpi_del_att() : dispatcher->inq_del_att() + * ncmpi_get_att() : dispatcher->inq_get_att() + * ncmpi_put_att() : dispatcher->inq_put_arr() + * + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include +#include +#include +#include +#include "ncchkio_internal.h" + +int +ncchkio_inq_attname(void *ncdp, + int varid, + int attid, + char *name) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_attname(ncchkp->ncp, varid, attid, name); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_inq_attid(void *ncdp, + int varid, + const char *name, + int *attidp) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_attid(ncchkp->ncp, varid, name, attidp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_inq_att(void *ncdp, + int varid, + const char *name, + nc_type *datatypep, + MPI_Offset *lenp) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_att(ncchkp->ncp, varid, name, datatypep, lenp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_rename_att(void *ncdp, + int varid, + const char *name, + const char *newname) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->rename_att(ncchkp->ncp, varid, name, newname); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + + +int +ncchkio_copy_att(void *ncdp_in, + int varid_in, + const char *name, + void *ncdp_out, + int varid_out) +{ + int err=NC_NOERR; + NC_chk *foo_in = (NC_chk*)ncdp_in; + NC_chk *foo_out = (NC_chk*)ncdp_out; + + err = foo_in->driver->copy_att(foo_in->ncp, varid_in, name, + foo_out->ncp, varid_out); + if (err != NC_NOERR) 
return err; + + return NC_NOERR; +} + +int +ncchkio_del_att(void *ncdp, + int varid, + const char *name) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->del_att(ncchkp->ncp, varid, name); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_get_att(void *ncdp, + int varid, + const char *name, + void *buf, + MPI_Datatype itype) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->get_att(ncchkp->ncp, varid, name, buf, itype); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_put_att(void *ncdp, + int varid, + const char *name, + nc_type xtype, + MPI_Offset nelems, + const void *buf, + MPI_Datatype itype) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->put_att(ncchkp->ncp, varid, name, xtype, nelems, buf, + itype); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkio_dim.c b/src/drivers/ncchunkio/ncchkio_dim.c new file mode 100644 index 0000000000..ef5cee4fb1 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_dim.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_def_dim() : dispatcher->def_dim() + * ncmpi_inq_dimid() : dispatcher->inq_dimid() + * ncmpi_inq_dim() : dispatcher->inq_dim() + * ncmpi_rename_dim() : dispatcher->rename_dim() + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include + +#include + +#include +#include +#include +#include "ncchkio_internal.h" + +int +ncchkio_def_dim(void *ncdp, + const char *name, + MPI_Offset size, + int *dimidp) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->def_dim(ncchkp->ncp, name, size, dimidp); + if (err != NC_NOERR) return err; + + if (size == NC_UNLIMITED){ + ncchkp->recdim = *dimidp; + } + + return NC_NOERR; +} + +int +ncchkio_inq_dimid(void *ncdp, + const char *name, + int *dimid) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_dimid(ncchkp->ncp, name, dimid); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int +ncchkio_inq_dim(void *ncdp, + int dimid, + char *name, + MPI_Offset *sizep) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->inq_dim(ncchkp->ncp, dimid, name, sizep); + if (err != NC_NOERR) return err; + + if (dimid == ncchkp->recdim){ // update # records + if (*sizep < ncchkp->recsize){ + *sizep = ncchkp->recsize; + } + } + + return NC_NOERR; +} + +int +ncchkio_rename_dim(void *ncdp, + int dimid, + const char *newname) +{ + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk*)ncdp; + + err = ncchkp->driver->rename_dim(ncchkp->ncp, dimid, newname); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkio_driver.c b/src/drivers/ncchunkio/ncchkio_driver.c new file mode 100644 index 0000000000..1fda000f58 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_driver.c @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+/* Dispatch table of the ncchkio driver: every entry is one of the
+ * ncchkio_* entry points.  The initializer is positional (no designators),
+ * so the entries must stay in the member order of PNC_driver, which is
+ * declared outside this file.
+ */
+static PNC_driver ncchkio_driver = {
+    /* FILE APIs */
+    ncchkio_create,
+    ncchkio_open,
+    ncchkio_close,
+    ncchkio_enddef,
+    ncchkio__enddef,
+    ncchkio_redef,
+    ncchkio_sync,
+    ncchkio_flush,
+    ncchkio_abort,
+    ncchkio_set_fill,
+    ncchkio_inq,
+    ncchkio_inq_misc,
+    ncchkio_sync_numrecs,
+    ncchkio_begin_indep_data,
+    ncchkio_end_indep_data,
+
+    /* DIMENSION APIs */
+    ncchkio_def_dim,
+    ncchkio_inq_dimid,
+    ncchkio_inq_dim,
+    ncchkio_rename_dim,
+
+    /* ATTRIBUTE APIs */
+    ncchkio_inq_att,
+    ncchkio_inq_attid,
+    ncchkio_inq_attname,
+    ncchkio_copy_att,
+    ncchkio_rename_att,
+    ncchkio_del_att,
+    ncchkio_get_att,
+    ncchkio_put_att,
+
+    /* VARIABLE APIs */
+    ncchkio_def_var,
+    ncchkio_def_var_fill,
+    ncchkio_fill_var_rec,
+    ncchkio_inq_var,
+    ncchkio_inq_varid,
+    ncchkio_rename_var,
+    ncchkio_get_var,
+    ncchkio_put_var,
+    ncchkio_get_varn,
+    ncchkio_put_varn,
+    ncchkio_get_vard,
+    ncchkio_put_vard,
+    ncchkio_iget_var,
+    ncchkio_iput_var,
+    ncchkio_bput_var,
+    ncchkio_iget_varn,
+    ncchkio_iput_varn,
+    ncchkio_bput_varn,
+    /* Buffer attach/detach and request wait/cancel entry points */
+    ncchkio_buffer_attach,
+    ncchkio_buffer_detach,
+    ncchkio_wait,
+    ncchkio_cancel
+};
+/* Return the address of the file-static dispatch table above */
+PNC_driver* ncchkio_inq_driver(void) {
+    return &ncchkio_driver;
+}
+
diff --git a/src/drivers/ncchunkio/ncchkio_driver.h b/src/drivers/ncchunkio/ncchkio_driver.h
new file mode 100644
index 0000000000..94a728d766
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkio_driver.h
@@ -0,0 +1,422 @@
+/*
+ * Copyright (C) 2017, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory. 
+ */
+/* $Id$ */
+
+#ifndef _ncchkio_DRIVER_H
+#define _ncchkio_DRIVER_H
+
+#include
+#include
+#include
+#include
+
+#include "ncchkioi_profile.h"
+/* Variable kinds -- presumably the values stored in NC_chk_var.varkind; verify against users */
+#define NC_CHK_VAR_RAW 0
+#define NC_CHK_VAR_COMPRESSED 1
+#define NC_CHK_VAR_DATA 2
+#define NC_CHK_VAR_META 3
+/* Chunk mapping methods -- presumably the values of NC_chk_var.chunk_map_method; verify */
+#define NC_CHK_MAPPING_STATIC 0
+#define NC_CHK_MAPPING_DYNAMIC 01 /* note: 01 is an octal literal; its value is 1 */
+/* Communication units -- presumably the values of NC_chk.comm_unit; verify */
+#define NC_CHK_COMM_CHUNK 0
+#define NC_CHK_COMM_PROC 1
+/* NOTE(review): the macro name below ends in an underscore and looks unfinished -- confirm it is intentional */
+#define NC_CHK_ 1
+
+/* Chunk cache structure: one node of a doubly linked list (prev/next) */
+typedef struct NC_chk_cache {
+    char *buf;  // Buffer
+    size_t bsize;   // Size in byte
+    int serial; // batch number to detect swap out of cache allocated in the same batch
+    struct NC_chk_cache **ref;  // Ref to clr when it is swap out
+    struct NC_chk_cache *prev;
+    struct NC_chk_cache *next;
+} NC_chk_cache;
+
+/* Request structure; NC_chk keeps one list of these for puts and one for
+ * gets (putlist / getlist), so it is not specific to get requests */
+typedef struct NC_chk_req {
+    int varid;
+    int nreq;
+    MPI_Offset *start;
+    MPI_Offset **starts;
+    MPI_Offset *count;
+    MPI_Offset **counts;
+    MPI_Offset *stride;
+    MPI_Offset bufcount;
+    MPI_Datatype buftype;
+    char *buf;
+    char *xbuf;
+    char **xbufs;
+} NC_chk_req;
+
+/* Request list structure (pool of NC_chk_req plus id bookkeeping) */
+typedef struct NC_chk_req_list {
+    NC_chk_req *reqs;   // Array of request object
+    int *ids;   // Array of request ids
+    int *pos;   // Array of position of request ids in ids
+    int nalloc; // Size of the pool
+    int nused;  // Number of ids issued
+} NC_chk_req_list;
+/* Per-chunk record; field meanings are not documented here -- presumably x* refers to the compressed form, verify */
+typedef struct NC_chk_var_chunk {
+    MPI_Offset *start;
+    MPI_Offset *xdata_offs;
+    MPI_Offset *xdata_lens;
+    int owner;
+    char *data;
+    char *xdata;
+} NC_chk_var_chunk;
+/* One entry of a variable's chunk index: an offset and a length */
+typedef struct NC_chk_chunk_index_entry {
+    MPI_Offset off;
+    int len;
+} NC_chk_chunk_index_entry;
+/* Per-variable state of the chunked driver */
+typedef struct NC_chk_var {
+    int varkind;
+    int isrec;
+    int isnew;
+
+    nc_type xtype;
+    MPI_Datatype etype;
+    int esize;
+
+    int ndim;
+    MPI_Offset *dimsize;
+    int *dimids;
+
+    int varid;
+
+    int nchunk;
+    int nchunkrec;
+    int nchunkalloc;
+    int nrec;
+    int nrecalloc;
+    int expanded;
+    int chunksize;
+    int *nchunks;
+    int *cidsteps;
+    int *chunk_owner;
+    int *chunkdim;
+    int *dirty;
+    NC_chk_cache **chunk_cache;
+
+    int nmychunk;
+    int nmychunkrec;
+    int *mychunks;
+
+    MPI_Offset metaoff;
+    NC_chk_chunk_index_entry *chunk_index;
+    // MPI_Offset *data_offs;
+    // int *data_lens;
+
+    NCCHK_filter *filter_driver; /* Compression driver */
+    int filter;
+
+    int chunk_map_method;
+} NC_chk_var;
+/* Array of NC_chk_var; cnt/nalloc are presumably entries in use / capacity -- verify */
+typedef struct NC_chk_var_list {
+    NC_chk_var *data;
+    int cnt;
+    int nalloc;
+} NC_chk_var_list;
+/* Per-file object of the chunked driver; wraps another driver's file object (ncp) and its dispatch table (driver) */
+typedef struct NC_chk NC_chk; /* forward reference */
+struct NC_chk {
+    int mode; /* file _open/_create mode */
+    int flag; /* define/data/collective/indep mode */
+    int rank; /* filled by MPI_Comm_rank(comm) at create time */
+    int np; /* filled by MPI_Comm_size(comm) at create time */
+    char *path; /* path name */
+    MPI_Comm comm; /* MPI communicator */
+    void *ncp; /* pointer to driver's internal object */
+    struct PNC_driver *driver;
+    int blockmapping;
+    MPI_Offset recsize; /* record dim size */
+    MPI_Offset recnalloc; /* record dim allocated */
+    MPI_Offset default_recnalloc;
+    int recdim; /* record dim id */
+    NC_chk_var_list vars;
+    NC_chk_req_list putlist, getlist;
+    int comm_unit;
+    int delay_init;
+    int exact_cown;
+    int max_ndim;
+    int max_chunk_size;
+    MPI_Offset nmychunks; // Sum of nmychunk in everyvar
+    int default_filter;
+    int nwrite;
+    MPI_Offset getsize;
+    MPI_Offset putsize;
+    size_t cache_limit;
+    size_t cache_limit_hint;
+    size_t cache_used;
+    int cache_serial;
+    NC_chk_cache *cache_head;
+    NC_chk_cache *cache_tail;
+    int ndim; // Number of dim in file
+    int *chunkdim; // Default chunk dim for each dimension
+    MPI_Offset cown_size; // Size of all chunks owned
+    MPI_Datatype overlaptype;
+    MPI_Op max_cown_op;
+    MPI_Offset assigned_chunks;
+    double cown_ratio;
+    size_t hdr_reserve; // Additional reserve space in the file header
+#ifdef PNETCDF_PROFILING
+    NC_chk_timers profile; /* the members below exist only in profiling builds */
+    MPI_Offset sendsize;
+    MPI_Offset recvsize;
+    MPI_Offset var_size_sum;
+    MPI_Offset var_zsize_sum;
+    int nsend;
+    int nrecv;
+    int nremote;
+    int nreq;
+    int nlocal;
+#endif
+};
+
+extern int ncchkio_create (
+    MPI_Comm comm, const char *path, int 
cmode, int ncid, int env_mode, MPI_Info info, PNCIO_node_ids node_ids, void **ncdp); + +extern int ncchkio_open ( + MPI_Comm comm, const char *path, int omode, int ncid, int env_mode, MPI_Info info, PNCIO_node_ids node_ids, void **ncdp); + +extern int ncchkio_close (void *ncdp); + +extern int ncchkio_enddef (void *ncdp); + +extern int ncchkio__enddef ( + void *ncdp, MPI_Offset h_minfree, MPI_Offset v_align, MPI_Offset v_minfree, MPI_Offset r_align); + +extern int ncchkio_redef (void *ncdp); + +extern int ncchkio_sync (void *ncdp); + +extern int ncchkio_flush (void *ncdp); + +extern int ncchkio_abort (void *ncdp); + +extern int ncchkio_set_fill (void *ncdp, int fill_mode, int *old_fill_mode); + +extern int ncchkio_fill_var_rec (void *ncdp, int varid, MPI_Offset recno); + +extern int ncchkio_inq (void *ncdp, int *ndimsp, int *nvarsp, int *nattsp, int *xtendimp); + +extern int ncchkio_inq_misc (void *ncdp, + int *pathlen, + char *path, + int *num_fix_varsp, + int *num_rec_varsp, + int *striping_size, + int *striping_count, + MPI_Offset *header_size, + MPI_Offset *header_extent, + MPI_Offset *recsize, + MPI_Offset *put_size, + MPI_Offset *get_size, + MPI_Info *info_used, + int *nreqs, + MPI_Offset *usage, + MPI_Offset *buf_size); + +extern int ncchkio_sync_numrecs (void *ncdp); + +extern int ncchkio_begin_indep_data (void *ncdp); + +extern int ncchkio_end_indep_data (void *ncdp); + +extern int ncchkio_def_dim (void *ncdp, const char *name, MPI_Offset size, int *dimidp); + +extern int ncchkio_inq_dimid (void *ncdp, const char *name, int *dimidp); + +extern int ncchkio_inq_dim (void *ncdp, int dimid, char *name, MPI_Offset *lengthp); + +extern int ncchkio_rename_dim (void *ncdp, int dimid, const char *newname); + +extern int ncchkio_inq_att ( + void *ncdp, int varid, const char *name, nc_type *xtypep, MPI_Offset *lenp); + +extern int ncchkio_inq_attid (void *ncdp, int varid, const char *name, int *idp); + +extern int ncchkio_inq_attname (void *ncdp, int varid, int 
attnum, char *name); + +extern int ncchkio_copy_att ( + void *ncdp_in, int varid_in, const char *name, void *ncdp_out, int varid_out); + +extern int ncchkio_rename_att (void *ncdp, int varid, const char *name, const char *newname); + +extern int ncchkio_del_att (void *ncdp, int varid, const char *name); + +extern int ncchkio_get_att ( + void *ncdp, int varid, const char *name, void *value, MPI_Datatype itype); + +extern int ncchkio_put_att (void *ncdp, + int varid, + const char *name, + nc_type xtype, + MPI_Offset nelems, + const void *value, + MPI_Datatype itype); + +extern int ncchkio_def_var ( + void *ncdp, const char *name, nc_type type, int ndims, const int *dimids, int *varidp); + +extern int ncchkio_def_var_fill (void *ncdp, int varid, int nofill, const void *fill_value); + +extern int ncchkio_inq_var (void *ncdp, + int varid, + char *name, + nc_type *xtypep, + int *ndimsp, + int *dimids, + int *nattsp, + MPI_Offset *offsetp, + int *no_fill, + void *fill_value); + +extern int ncchkio_inq_varid (void *ncdp, const char *name, int *varid); + +extern int ncchkio_rename_var (void *ncdp, int varid, const char *newname); + +extern int ncchkio_get_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_put_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_get_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_put_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + 
MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_get_vard (void *ncdp, + int varid, + MPI_Datatype filetype, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_put_vard (void *ncdp, + int varid, + MPI_Datatype filetype, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode); + +extern int ncchkio_iget_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *req, + int reqMode); + +extern int ncchkio_iput_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *req, + int reqMode); + +extern int ncchkio_bput_var (void *ncdp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *req, + int reqMode); + +extern int ncchkio_iget_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode); + +extern int ncchkio_iput_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode); + +extern int ncchkio_bput_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode); + +extern int ncchkio_buffer_attach (void *ncdp, MPI_Offset bufsize); + +extern int ncchkio_buffer_detach (void *ncdp); + +extern int ncchkio_wait (void *ncdp, int num_reqs, int *req_ids, int *statuses, 
int reqMode); + +extern int ncchkio_cancel (void *ncdp, int num_reqs, int *req_ids, int *statuses); + +#endif diff --git a/src/drivers/ncchunkio/ncchkio_file.c b/src/drivers/ncchunkio/ncchkio_file.c new file mode 100644 index 0000000000..604d3fad2e --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_file.c @@ -0,0 +1,841 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs + * + * ncmpi_create() : dispatcher->create() + * ncmpi_open() : dispatcher->open() + * ncmpi_close() : dispatcher->close() + * ncmpi_enddef() : dispatcher->enddef() + * ncmpi__enddef() : dispatcher->_enddef() + * ncmpi_redef() : dispatcher->redef() + * ncmpi_begin_indep_data() : dispatcher->begin_indep_data() + * ncmpi_end_indep_data() : dispatcher->end_indep_data() + * ncmpi_abort() : dispatcher->abort() + * ncmpi_inq() : dispatcher->inq() + * ncmpi_inq_misc() : dispatcher->inq_misc() + * ncmpi_wait() : dispatcher->wait() + * ncmpi_wait_all() : dispatcher->wait() + * ncmpi_cancel() : dispatcher->cancel() + * + * ncmpi_set_fill() : dispatcher->set_fill() + * ncmpi_fill_var_rec() : dispatcher->fill_rec() + * ncmpi_def_var_fill() : dispatcher->def_var_fill() + * ncmpi_inq_var_fill() : dispatcher->inq() + * + * ncmpi_sync() : dispatcher->sync() + * ncmpi_flush() : dispatcher->flush() + * ncmpi_sync_numrecs() : dispatcher->sync_numrecs() + * + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include /* strlen() */ + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkio_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ +{ + int err=NC_NOERR; + int one = 1; + void *ncp = NULL; + NC_chk *ncchkp; + PNC_driver *driver = NULL; +#ifdef 
PNETCDF_PROFILING + double t0; + t0 = MPI_Wtime (); +#endif + + /* TODO: use cmode to determine the true driver */ + driver = ncmpio_inq_driver (); + if (driver == NULL) return NC_ENOTNC; + + err = driver->create(comm, path, cmode | NC_64BIT_DATA, ncid, env_mode, info, node_ids, &ncp); + if (err != NC_NOERR) return err; + + /* Create a NC_chk object and save its driver pointer */ + ncchkp = (NC_chk *)NCI_Malloc (sizeof (NC_chk)); + if (ncchkp == NULL) DEBUG_RETURN_ERROR (NC_ENOMEM) + + ncchkp->path = (char *)NCI_Malloc (strlen (path) + 1); + if (ncchkp->path == NULL) { + NCI_Free (ncchkp); + DEBUG_RETURN_ERROR (NC_ENOMEM) + } + strcpy (ncchkp->path, path); + ncchkp->mode = cmode | NC_WRITE; + ncchkp->driver = driver; + ncchkp->flag = 0; + ncchkp->ncp = ncp; + ncchkp->comm = comm; + MPI_Comm_rank (comm, &(ncchkp->rank)); + MPI_Comm_size (comm, &(ncchkp->np)); + + ncchkioi_init (ncchkp, 1); + + err = ncchkioi_extract_hint (ncchkp, info); + if (err != NC_NOERR) return err; + + err = driver->put_att (ncchkp->ncp, NC_GLOBAL, "_comressed", NC_INT, 1, &one, + MPI_INT); // Mark this file as compressed + if (err != NC_NOERR) return err; + + *ncpp = ncchkp; + + // Timer array is not avaiable until init, can't use NC_CHK_TIMER_START +#ifdef PNETCDF_PROFILING + t0 = MPI_Wtime () - t0; + ncchkp->profile.tt[NC_CHK_TIMER_VAR_INIT] += t0; + ncchkp->profile.tt[NC_CHK_TIMER_TOTAL] += t0; +#endif + + return NC_NOERR; +} + +int ncchkio_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ +{ + int err=NC_NOERR; + int one = 0; + void *ncp = NULL; + NC_chk *ncchkp = NULL; + PNC_driver *driver = NULL; +#ifdef PNETCDF_PROFILING + double t0; + + t0 = MPI_Wtime (); +#endif + + /* TODO: use omode to determine the true driver */ + driver = ncmpio_inq_driver (); + if (driver == NULL) { + DEBUG_ASSIGN_ERROR (err, NC_ENOTNC) + goto errout; + } + + err = 
driver->open(comm, path, omode, ncid, env_mode, info, node_ids, &ncp); + if (err != NC_NOERR) goto errout; + + /* Create a NC_chk object and save its driver pointer */ + ncchkp = (NC_chk *)NCI_Malloc (sizeof (NC_chk)); + if (ncchkp == NULL) { + DEBUG_ASSIGN_ERROR (err, NC_ENOMEM) + goto errout; + } + + ncchkp->path = (char *)NCI_Malloc (strlen (path) + 1); + if (ncchkp->path == NULL) { + NCI_Free (ncchkp); + DEBUG_ASSIGN_ERROR (err, NC_ENOMEM) + goto errout; + } + strcpy (ncchkp->path, path); + ncchkp->mode = omode; + ncchkp->driver = driver; + if (ncchkp->mode & NC_WRITE) { + ncchkp->flag = 0; + } else { + ncchkp->flag |= NC_MODE_RDONLY; + } + ncchkp->ncp = ncp; + ncchkp->comm = comm; + MPI_Comm_rank (comm, &(ncchkp->rank)); + MPI_Comm_size (comm, &(ncchkp->np)); + + ncchkioi_init (ncchkp, 0); + + err = ncchkioi_extract_hint (ncchkp, info); + if (err != NC_NOERR) goto errout; + + err = driver->get_att (ncchkp->ncp, NC_GLOBAL, "_comressed", &one, + MPI_INT); // Mark this file as compressed + if (err != NC_NOERR) { + if (err == NC_ENOTATT) { err = NC_EINVAL; } + goto errout; + } + + // Not compressed file + if (one != 1) { + NCI_Free (ncchkp->path); + NCI_Free (ncchkp); + DEBUG_RETURN_ERROR (NC_EINVAL) + } + + err = ncchkioi_get_default_chunk_dim (ncchkp); + if (err != NC_NOERR) return err; + + ncchkioi_parse_var_info (ncchkp); + + *ncpp = ncchkp; + + // Timer array is not avaiable until init, can't use NC_CHK_TIMER_START +#ifdef PNETCDF_PROFILING + t0 = MPI_Wtime () - t0; + ncchkp->profile.tt[NC_CHK_TIMER_VAR_INIT] += t0; + ncchkp->profile.tt[NC_CHK_TIMER_TOTAL] += t0; +#endif + + return NC_NOERR; + +errout: + if (ncp != NULL) { driver->close (ncchkp->ncp); } + if (ncchkp != NULL) { + if (ncchkp->path != NULL) { NCI_Free (ncchkp->path); } + NCI_Free (ncchkp); + } + + return err; +} + +int ncchkio_close (void *ncdp) { + int err=NC_NOERR; +#ifdef PNETCDF_PROFILING + MPI_Offset put_size, get_size; + char *_env_str = getenv ("PNETCDF_SHOW_PERFORMANCE_INFO"); +#endif + 
NC_chk *ncchkp = (NC_chk *)ncdp; + +#ifdef PNETCDF_PROFILING + if (_env_str != NULL && *_env_str != '0') { ncchkioi_update_statistics (ncchkp); } +#endif + + NC_CHK_TIMER_START (NC_CHK_TIMER_FINALIZE) + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + if (ncchkp == NULL) DEBUG_RETURN_ERROR (NC_EBADID) + + if (!(ncchkp->flag & NC_MODE_RDONLY)) { + int i; + + NC_CHK_TIMER_START (NC_CHK_TIMER_FINALIZE_META) + + err = ncchkp->driver->redef (ncchkp->ncp); + if (err != NC_NOERR) { return err; } + + // record chunk dim + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (ncchkp->vars.data[i].isnew) { + err = ncchkp->driver->put_att (ncchkp->ncp, ncchkp->vars.data[i].varid, "_chunkdim", + NC_INT, ncchkp->vars.data[i].ndim, + ncchkp->vars.data[i].chunkdim, MPI_INT); + if (err != NC_NOERR) { return err; } + err = + ncchkp->driver->put_att (ncchkp->ncp, ncchkp->vars.data[i].varid, "_filter", + NC_INT, 1, &(ncchkp->vars.data[i].filter), MPI_INT); + if (err != NC_NOERR) { return err; } + } + } + + // Record recsize + err = ncchkp->driver->put_att (ncchkp->ncp, NC_GLOBAL, "_recsize", NC_INT64, 1, + &(ncchkp->recsize), + MPI_LONG_LONG); // Mark this file as compressed + if (err != NC_NOERR) return err; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_FINALIZE_META) + } + +#ifdef PNETCDF_PROFILING + err = ncchkp->driver->inq_misc (ncchkp->ncp, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, &put_size, &get_size, NULL, NULL, NULL, NULL); + CHK_ERR + ncchkp->putsize += put_size; + ncchkp->getsize += get_size; +#endif + err = ncchkp->driver->close (ncchkp->ncp); + CHK_ERR + + ncchkioi_cache_free (ncchkp); + + err = ncchkioi_var_list_free (&(ncchkp->vars)); + CHK_ERR + + err = ncchkioi_req_list_free (&(ncchkp->putlist)); + CHK_ERR + err = ncchkioi_req_list_free (&(ncchkp->getlist)); + CHK_ERR + + NCI_Free (ncchkp->chunkdim); + + if (ncchkp->overlaptype != MPI_DATATYPE_NULL) { MPI_Type_free (&(ncchkp->overlaptype)); } + if (ncchkp->max_cown_op != MPI_OP_NULL) { MPI_Op_free (&(ncchkp->max_cown_op)); 
} + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_FINALIZE) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + +#ifdef PNETCDF_PROFILING + if (_env_str != NULL && *_env_str != '0') { + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_PUT_SIZE, + (double)ncchkp->putsize / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_GET_SIZE, + (double)ncchkp->getsize / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_SEND_SIZE, + (double)ncchkp->sendsize / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_RECV_SIZE, + (double)ncchkp->recvsize / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NSEND, (double)ncchkp->nsend); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NRECV, (double)ncchkp->nrecv); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NREMOTE, (double)ncchkp->nremote); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NREQ, (double)ncchkp->nreq); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NLOCAL, (double)ncchkp->nlocal); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_NCHUNK, (double)ncchkp->nmychunks); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_VAR_SIZE, + (double)ncchkp->var_size_sum / 1048576.0f); + ncchkioi_profile_add_time (ncchkp, NC_CHK_TIMER_VAR_ZSIZE, + (double)ncchkp->var_zsize_sum / 1048576.0f); + + ncchkioi_print_profile (ncchkp); + } +#endif + +err_out:; + + NCI_Free (ncchkp->path); + + NCI_Free (ncchkp); + + return err; +} + +int ncchkio_enddef (void *ncdp) { + int err=NC_NOERR, ret; + int i; + MPI_Offset logrecnalloc, drecnalloc; + MPI_Offset rsize; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT) + + drecnalloc = 1; + logrecnalloc = 0; + while (drecnalloc < ncchkp->default_recnalloc) { + logrecnalloc++; + drecnalloc <<= 1; + } + + // Reserve header space + rsize = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + varp = ncchkp->vars.data + i; + rsize = 0; + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + 
if (varp->isrec) { + rsize += + ((8 + 16) + 4 + 8 + 4) * 8 + ((8 + 16) + 4 + 8 + 4 * varp->ndim) * 2; // Atts + rsize += ((8 + 32) + 8) * (ncchkp->default_recnalloc + logrecnalloc + 1); // dims + rsize += ((8 + 32) + 8 + 8 + (8 + 8 + (8 + 12 + 4 + 8 + 4)) + 4 + 8 + 8) * + (ncchkp->default_recnalloc + 2 * logrecnalloc); // vars + } else { + rsize += + ((8 + 16) + 4 + 8 + 4) * 8 + ((8 + 16) + 4 + 8 + 4 * varp->ndim) * 2; // Atts + rsize += ((8 + 32) + 8) * 3; // dims + rsize += + ((8 + 32) + 8 + 8 + (8 + 8 + (8 + 12 + 4 + 8 + 4)) + 4 + 8 + 8) * 3; // vars + } + } else { + rsize += ((8 + 16) + 4 + 8 + 4); // Atts + } + } + //rsize *= 2; // 2 times for future expension + // Add additional reserve size + rsize += ncchkp->hdr_reserve; + + err = ncchkp->driver->_enddef (ncchkp->ncp, rsize, 0, 0, 0); + if (err != NC_NOERR) return err; + + err = ncchkioi_get_default_chunk_dim (ncchkp); + if (err != NC_NOERR) return err; + + if (!(ncchkp->delay_init)) { + int nread; + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Datatype ftype, mtype; + MPI_Status status; + NC_chk_var *varp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + lens = NCI_Malloc (sizeof (int) * ncchkp->vars.cnt); + fdisps = NCI_Malloc (sizeof (MPI_Aint) * ncchkp->vars.cnt * 2); + mdisps = fdisps + ncchkp->vars.cnt; + + nread = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + varp = ncchkp->vars.data + i; + + ncchkioi_var_init (ncchkp, varp, 0, NULL, NULL); + + if (!(varp->isnew)) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { + lens[nread] = sizeof (NC_chk_chunk_index_entry) * (varp->nchunk); + fdisps[nread] = varp->metaoff; + mdisps[nread++] = (MPI_Aint) (varp->chunk_index); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + } + + if (nread) { + ncchkioi_sort_file_offset (nread, fdisps, mdisps, lens); + + MPI_Type_create_hindexed 
(nread, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nread, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, + ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, ftype, "native", + MPI_INFO_NULL); + + // Read data + CHK_ERR_READ_AT_ALL (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BOTTOM, 1, mtype, + &status); + + // Restore file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL); + +#ifdef WORDS_BIGENDIAN // Switch back to big endian + ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1); +#endif + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } + + NCI_Free (lens); + NCI_Free (fdisps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + } + +err_out:; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + return err; +} + +int ncchkio__enddef (void *ncdp, + MPI_Offset h_minfree, + MPI_Offset v_align, + MPI_Offset v_minfree, + MPI_Offset r_align) { + int err=NC_NOERR, ret; + int i; + MPI_Offset logrecnalloc, drecnalloc; + MPI_Offset rsize; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT) + + drecnalloc = 1; + logrecnalloc = 0; + while (drecnalloc < ncchkp->default_recnalloc) { + logrecnalloc++; + drecnalloc <<= 1; + } + + // Reserve header space + rsize = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + varp = ncchkp->vars.data + i; + rsize = 0; + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + if (varp->isrec) { + rsize += + ((8 + 16) + 4 + 8 + 4) * 8 + ((8 + 16) + 4 + 8 + 4 * varp->ndim) * 2; // Atts + rsize += ((8 + 32) + 8) * (ncchkp->default_recnalloc + logrecnalloc + 1); // dims + rsize += ((8 + 32) + 8 + 8 + (8 + 8 + (8 + 12 + 4 + 8 + 4)) + 4 + 8 + 8) * + (ncchkp->default_recnalloc + 2 * logrecnalloc); // vars + } else 
{ + rsize += + ((8 + 16) + 4 + 8 + 4) * 8 + ((8 + 16) + 4 + 8 + 4 * varp->ndim) * 2; // Atts + rsize += ((8 + 32) + 8) * 3; // dims + rsize += + ((8 + 32) + 8 + 8 + (8 + 8 + (8 + 12 + 4 + 8 + 4)) + 4 + 8 + 8) * 3; // vars + } + } else { + rsize += ((8 + 16) + 4 + 8 + 4); // Atts + } + } + rsize *= 2; // 2 times for future expension + + err = ncchkp->driver->_enddef (ncchkp->ncp, h_minfree + rsize, v_align, v_minfree, r_align); + if (err != NC_NOERR) return err; + + err = ncchkioi_get_default_chunk_dim (ncchkp); + if (err != NC_NOERR) return err; + + if (!(ncchkp->delay_init)) { + int nread; + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Datatype ftype, mtype; + MPI_Status status; + NC_chk_var *varp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + lens = NCI_Malloc (sizeof (int) * ncchkp->vars.cnt); + fdisps = NCI_Malloc (sizeof (MPI_Aint) * ncchkp->vars.cnt * 2); + mdisps = fdisps + ncchkp->vars.cnt; + + nread = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + varp = ncchkp->vars.data + i; + + err = ncchkioi_var_init (ncchkp, varp, 0, NULL, NULL); + CHK_ERR + + if (!(varp->isnew)) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { + lens[nread] = sizeof (NC_chk_chunk_index_entry) * (varp->nchunk); + fdisps[nread] = varp->metaoff; + mdisps[nread++] = (MPI_Aint) (varp->chunk_index); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + } + + if (nread) { + ncchkioi_sort_file_offset (nread, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nread, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nread, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, + ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, ftype, "native", + MPI_INFO_NULL); + + // Read data + 
CHK_ERR_READ_AT_ALL (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BOTTOM, 1, mtype, + &status); + + // Restore file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL); + +#ifdef WORDS_BIGENDIAN // Switch back to big endian + ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1); +#endif + + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } + + NCI_Free (lens); + NCI_Free (fdisps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + +err_out:; + return err; +} + +int ncchkio_redef (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->redef (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_begin_indep_data (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->begin_indep_data (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_end_indep_data (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->end_indep_data (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_abort (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + if (ncchkp == NULL) DEBUG_RETURN_ERROR (NC_EBADID) + + err = ncchkp->driver->abort (ncchkp->ncp); + + NCI_Free (ncchkp->path); + NCI_Free (ncchkp); + + return err; +} + +int ncchkio_inq (void *ncdp, int *ndimsp, int *nvarsp, int *nattsp, int *xtendimp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->inq (ncchkp->ncp, ndimsp, NULL, nattsp, xtendimp); + if (err != NC_NOERR) return err; + + if (nvarsp != NULL) { *nvarsp = ncchkp->vars.cnt; } + + return NC_NOERR; +} + +int ncchkio_inq_misc (void *ncdp, + int *pathlen, + char *path, + int *num_fix_varsp, + int *num_rec_varsp, + int *striping_size, + int 
*striping_count, + MPI_Offset *header_size, + MPI_Offset *header_extent, + MPI_Offset *recsize, + MPI_Offset *put_size, + MPI_Offset *get_size, + MPI_Info *info_used, + int *nreqs, + MPI_Offset *usage, + MPI_Offset *buf_size) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->inq_misc (ncchkp->ncp, pathlen, path, num_fix_varsp, num_rec_varsp, + striping_size, striping_count, header_size, header_extent, + recsize, put_size, get_size, info_used, nreqs, usage, buf_size); + if (err != NC_NOERR) return err; + + if (num_fix_varsp != NULL) { *num_fix_varsp = ncchkp->vars.cnt; } + + if (nreqs != NULL) { *nreqs = ncchkp->putlist.nused + ncchkp->getlist.nused; } + + if (put_size != NULL) { *put_size += ncchkp->putsize; } + + if (get_size != NULL) { *get_size += ncchkp->getsize; } + + return NC_NOERR; +} + +int ncchkio_cancel (void *ncdp, int num_req, int *req_ids, int *statuses) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->cancel (ncchkp->ncp, num_req, req_ids, statuses); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_wait (void *ncdp, int num_reqs, int *req_ids, int *statuses, int reqMode) { + int err = NC_NOERR, status = NC_NOERR; + int i; + int ncom = 0, nraw = 0; + int *rawreqs = NULL, *comreqs = NULL; + int *rawstats = NULL, *comstats = NULL; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT) + + if (num_reqs < 0) { // NC_REQ_ALL || nreqs == NC_PUT_REQ_ALL || nreqs == NC_GET_REQ_ALL + err = ncchkioi_wait (ncchkp, num_reqs, NULL, NULL, reqMode); + if (status == NC_NOERR) { status = err; } + err = ncchkp->driver->wait (ncchkp->ncp, num_reqs, NULL, NULL, reqMode); + if (status == NC_NOERR) { status = err; } + goto done; + } + + if (num_reqs > 0) { + // Count number of get and put requests + for (i = 0; i < num_reqs; i++) { + if (req_ids[i] != NC_REQ_NULL) nraw++; + // if (req_ids[i] & 1) { nraw++; } + } + + 
// Allocate buffer + ncom = num_reqs - nraw; + rawreqs = (int *)NCI_Malloc (sizeof (int) * nraw); + CHK_PTR (rawreqs) + comreqs = (int *)NCI_Malloc (sizeof (int) * ncom); + CHK_PTR (comreqs) + + // Build put and get req list + nraw = ncom = 0; + for (i = 0; i < num_reqs; i++) { + if (req_ids[i] & 1) { + rawreqs[nraw++] = req_ids[i] >> 1; + } else { + comreqs[ncom++] = req_ids[i] >> 1; + } + } + } + + if (statuses != NULL) { + rawstats = (int *)NCI_Malloc (sizeof (int) * nraw); + CHK_PTR (rawstats) + comstats = (int *)NCI_Malloc (sizeof (int) * ncom); + CHK_PTR (comstats) + } else { + rawstats = NULL; + comstats = NULL; + } + + if (nraw > 0 || reqMode == NC_REQ_COLL) { + err = ncchkp->driver->wait (ncchkp->ncp, nraw, rawreqs, rawstats, reqMode); + if (status == NC_NOERR) { status = err; } + } + + if (ncom > 0 || reqMode == NC_REQ_COLL) { + err = ncchkioi_wait (ncchkp, ncom, comreqs, comstats, reqMode); + if (status == NC_NOERR) { status = err; } + } + + // Assign stats + if (statuses != NULL) { + nraw = ncom = 0; + for (i = 0; i < num_reqs; i++) { + if (req_ids[i] & 1) { + statuses[i] = rawstats[nraw++]; + } else { + statuses[i] = comstats[ncom++]; + } + } + + NCI_Free (rawstats); + NCI_Free (comstats); + } + + NCI_Free (rawreqs); + NCI_Free (comreqs); + +err_out:; + if (status == NC_NOERR) status = err; +done:; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_WAIT) + + return status; +} + +int ncchkio_set_fill (void *ncdp, int fill_mode, int *old_fill_mode) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->set_fill (ncchkp->ncp, fill_mode, old_fill_mode); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_fill_var_rec (void *ncdp, int varid, MPI_Offset recno) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->fill_var_rec (ncchkp->ncp, varid, recno); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_def_var_fill (void *ncdp, 
int varid, int no_fill, const void *fill_value) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->def_var_fill (ncchkp->ncp, varid, no_fill, fill_value); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_sync_numrecs (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->sync_numrecs (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_sync (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->sync (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_flush (void *ncdp) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + err = ncchkp->driver->flush (ncchkp->ncp); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkio_internal.c b/src/drivers/ncchunkio/ncchkio_internal.c new file mode 100644 index 0000000000..9ad02e6e7f --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_internal.c @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2018, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkioi_init (NC_chk *ncchkp, int isnew) { + int err=NC_NOERR; + + ncchkp->max_ndim = 0; + ncchkp->max_chunk_size = 0; + ncchkp->getsize = 0; + ncchkp->putsize = 0; + ncchkp->nmychunks = 0; + ncchkp->nwrite = 0; + ncchkp->cache_head = NULL; + ncchkp->cache_tail = NULL; + ncchkp->cache_used = 0; + ncchkp->cache_limit = 0; + ncchkp->cache_serial = 0; + ncchkp->ndim = 0; + ncchkp->chunkdim = NULL; + ncchkp->assigned_chunks = 0; + ncchkp->cown_size = 0; + ncchkp->max_cown_op = MPI_OP_NULL; + ncchkp->overlaptype = MPI_DATATYPE_NULL; + + err = ncchkp->driver->inq (ncchkp->ncp, NULL, NULL, NULL, &(ncchkp->recdim)); + if (err != NC_NOERR) return err; + + if (isnew) { + ncchkp->recsize = 0; + } else { + err = ncchkp->driver->get_att (ncchkp->ncp, NC_GLOBAL, "_recsize", &(ncchkp->recsize), + MPI_LONG_LONG); + CHK_ERR // Mark this file as compressed + } + + /* Initialize var list */ + err = ncchkioi_var_list_init (&(ncchkp->vars)); + if (err != NC_NOERR) return err; + + /* Initialize nonblocking list */ + err = ncchkioi_req_list_init (&(ncchkp->getlist)); + if (err != NC_NOERR) return err; + err = ncchkioi_req_list_init (&(ncchkp->putlist)); + if (err != NC_NOERR) return err; + +#ifdef PNETCDF_PROFILING + memset (&(ncchkp->profile), 0, sizeof (NC_chk_timers)); + ncchkp->sendsize = 0; + ncchkp->recvsize = 0; + ncchkp->nsend = 0; + ncchkp->nrecv = 0; + ncchkp->nremote = 0; + ncchkp->nreq = 0; + ncchkp->nlocal = 0; +#endif + +err_out:; + return err; +} + +int ncchkioi_parse_var_info (NC_chk *ncchkp) { + int err=NC_NOERR, ret; + int vid; + int i; + int nvar; + int varkind; + NC_chk_var *varp; + + int nread; + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Datatype ftype, mtype; + MPI_Status status; + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + err = ncchkp->driver->inq 
(ncchkp->ncp, NULL, &nvar, NULL, &(ncchkp->recdim)); + CHK_ERR + + if (nvar > 0) { + for (vid = 0; vid < nvar; vid++) { + err = ncchkp->driver->get_att (ncchkp->ncp, vid, "_varkind", &varkind, + MPI_INT); // Comressed var? + if (err != NC_NOERR) { continue; } + + if (varkind == NC_CHK_VAR_COMPRESSED || varkind == NC_CHK_VAR_RAW) { + err = ncchkioi_var_list_add (&(ncchkp->vars)); + if (err < 0) return err; + varp = ncchkp->vars.data + err; + + memset (varp, 0, sizeof (NC_chk_var)); + + varp->varid = vid; + varp->varkind = varkind; + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + err = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_ndim", &(varp->ndim), + MPI_INT); // Original dimensions + if (err != NC_NOERR) return err; + + varp->dimids = (int *)NCI_Malloc (sizeof (int) * varp->ndim); + CHK_PTR (varp->dimids) + varp->dimsize = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim); + CHK_PTR (varp->dimsize) + + err = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_dimids", + varp->dimids, MPI_INT); // Dimensiona IDs + if (err != NC_NOERR) return err; + + for (i = 0; i < varp->ndim; i++) { + ncchkp->driver->inq_dim (ncchkp->ncp, varp->dimids[i], NULL, + varp->dimsize + i); + } + if (varp->dimids[0] == ncchkp->recdim) { + varp->isrec = 1; + if (varp->dimsize[0] < ncchkp->recsize) { + varp->dimsize[0] = ncchkp->recsize; + } + } else { + varp->isrec = 0; + } + + err = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_datatype", + &(varp->xtype), MPI_INT); // Original datatype + if (err != NC_NOERR) return err; + + varp->esize = NC_Type_size (varp->xtype); + varp->etype = ncmpii_nc2mpitype (varp->xtype); + varp->chunkdim = NULL; + } + } + } + + // Collective read index table + if (!(ncchkp->delay_init)) { + lens = NCI_Malloc (sizeof (int) * nvar); + CHK_PTR (lens) + fdisps = NCI_Malloc (sizeof (MPI_Aint) * nvar * 2); + CHK_PTR (fdisps) + mdisps = fdisps + nvar; + + nread = 0; + for (vid = 0; vid < ncchkp->vars.cnt; vid++) { + varp = 
ncchkp->vars.data + vid; + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + // Init var + err = ncchkioi_var_init (ncchkp, varp, 0, NULL, NULL); + CHK_ERR + + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { + lens[nread] = sizeof (NC_chk_chunk_index_entry) * (varp->nchunk); + fdisps[nread] = varp->metaoff; + mdisps[nread++] = (MPI_Aint) (varp->chunk_index); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + } + + if (nread) { + ncchkioi_sort_file_offset (nread, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nread, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nread, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, + ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, ftype, "native", + MPI_INFO_NULL); + + // Read data + CHK_ERR_READ_AT_ALL (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BOTTOM, 1, mtype, + &status); + + // Restore file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BYTE, MPI_BYTE, + "native", MPI_INFO_NULL); + +#ifdef WORDS_BIGENDIAN // Switch back to little endian + ncchkioi_idx_in_swapn (varp - chunk_index, varp->nchunk + 1); +#endif + + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } + + for (vid = 0; vid < ncchkp->vars.cnt; vid++) { + varp = ncchkp->vars.data + vid; + } + + NCI_Free (lens); + NCI_Free (fdisps); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkio_internal.h b/src/drivers/ncchunkio/ncchkio_internal.h new file mode 100644 index 0000000000..992670b41d --- /dev/null +++ b/src/drivers/ncchunkio/ncchkio_internal.h @@ -0,0 +1,390 @@ +#ifndef _ncchkio_INTERNAL_H +#define _ncchkio_INTERNAL_H + +#ifdef HAVE_CONFIG_H +#include 
+#endif + +#include "ncchkio_driver.h" +#ifdef PNETCDF_DEBUG +#include +#endif + +#define NC_CHK_FILTER_NONE 0 +#define NC_CHK_FILTER_DUMMY 1 +#define NC_CHK_FILTER_ZLIB 2 +#define NC_CHK_FILTER_SZ 3 + +#define NC_CHK_DEFAULT_REC_ALLOC 1024 +#define NC_CHK_REC_MULTIPLIER 2 + +#ifdef PNETCDF_DEBUG +#define DEBUG_ABORT \ + { \ + char *_env_str = getenv ("PNETCDF_ABORT_ON_ERR"); \ + if (_env_str != NULL && *_env_str != '0') { abort (); } \ + } +#else +#define DEBUG_ABORT +#endif + +#define RET_ERR(E) \ + { \ + err = E; \ + DEBUG_TRACE_ERROR (err); \ + DEBUG_ABORT \ + goto err_out; \ + } +#define CHK_ERR \ + if (err != NC_NOERR) { \ + DEBUG_ABORT \ + goto err_out; \ + } + +#define CHK_MPIERR \ + if (err != MPI_SUCCESS) { \ + err = ncmpii_error_mpi2nc (err, "MPI"); \ + DEBUG_TRACE_ERROR (err); \ + DEBUG_ABORT \ + goto err_out; \ + } + +#define CHK_PTR(P) \ + if (!P) { \ + err = NC_ENOMEM; \ + DEBUG_TRACE_ERROR (err); \ + DEBUG_ABORT \ + goto err_out; \ + } + +#define CHK_ERR_WAIT(V0, V1) \ + err = MPI_Wait (V0, V1); \ + CHK_MPIERR + +#define CHK_ERR_ALLREDUCE(V0, V1, V2, V3, V4, V5) \ + err = MPI_Allreduce (V0, V1, V2, V3, V4, V5); \ + CHK_MPIERR + +#define CHK_ERR_IALLREDUCE(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Iallreduce (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR + +#define CHK_ERR_REDUCE(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Reduce (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR + +#define CHK_ERR_GATHER(V0, V1, V2, V3, V4, V5, V6, V7) \ + err = MPI_Gather (V0, V1, V2, V3, V4, V5, V6, V7); \ + CHK_MPIERR + +#ifdef PNETCDF_DEBUG +#define CHK_ERR_PACK(V0, V1, V2, V3, V4, V5, V6) \ + { \ + assert ((V0) != NULL); \ + assert ((V3) != NULL); \ + err = MPI_Pack (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR \ + } +#else +#define CHK_ERR_PACK(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Pack (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#endif + +#ifdef PNETCDF_DEBUG +#define CHK_ERR_UNPACK(V0, V1, V2, V3, V4, V5, V6) \ + { \ + int esize; \ + MPI_Type_size (V5, &esize); \ 
+ if (V1 - *((int *)(V2)) < V4 * esize) { abort (); } \ + err = MPI_Unpack (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR \ + } +#else +#define CHK_ERR_UNPACK(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Unpack (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#endif + +#define CHK_ERR_TYPE_COMMIT(V0) \ + err = MPI_Type_commit (V0); \ + CHK_MPIERR + +#ifdef PNETCDF_DEBUG +#define CHK_ERR_TYPE_CREATE_SUBARRAY(V0, V1, V2, V3, V4, V5, V6) \ + { \ + int d; \ + for (d = 0; d < V0; d++) { \ + if (V1[d] < V2[d] + V3[d]) { \ + printf ( \ + "Error: Subarray outside array at dim %d. size = %d, ssize = %d, start = " \ + "%d\n", \ + d, V1[d], V2[d], V3[d]); \ + abort (); \ + } \ + if (V2[d] <= 0) { \ + printf ("Error: Subarray size <= 0 at dim %d. ssize = %d\n", d, V2[d]); \ + abort (); \ + } \ + } \ + err = MPI_Type_create_subarray (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR \ + } +#else +#define CHK_ERR_TYPE_CREATE_SUBARRAY(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Type_create_subarray (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#endif + +#define CHK_ERR_WAITALL(V0, V1, V2) \ + err = MPI_Waitall (V0, V1, V2); \ + CHK_MPIERR +#define CHK_ERR_MPROBE(V0, V1, V2, V3, V4) \ + err = MPI_Mprobe (V0, V1, V2, V3, V4); \ + CHK_MPIERR + +#define CHK_ERR_GET_COUNT(V0, V1, V2) \ + err = MPI_Get_count (V0, V1, V2); \ + CHK_MPIERR + +#define CHK_ERR_IMRECV(V0, V1, V2, V3, V4) \ + err = MPI_Imrecv (V0, V1, V2, V3, V4); \ + CHK_MPIERR + +#ifdef PNETCDF_DEBUG +#define CHK_ERR_ISEND(V0, V1, V2, V3, V4, V5, V6) \ + assert (V1 >= 0); \ + err = MPI_Isend (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#else +#define CHK_ERR_ISEND(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Isend (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#endif + +#ifdef PNETCDF_DEBUG +#define CHK_ERR_IRECV(V0, V1, V2, V3, V4, V5, V6) \ + assert (V1 >= 0); \ + err = MPI_Irecv (V0, V1, V2, V3, V4, V5, V6); \ + CHK_MPIERR +#else +#define CHK_ERR_IRECV(V0, V1, V2, V3, V4, V5, V6) \ + err = MPI_Irecv (V0, V1, V2, V3, V4, V5, V6); \ + 
CHK_MPIERR +#endif + +#define CHK_ERR_SET_VIEW(V0, V1, V2, V3, V4, V5) \ + err = MPI_File_set_view (V0, V1, V2, V3, V4, V5); \ + CHK_MPIERR + +#define CHK_ERR_READ_AT_ALL(V0, V1, V2, V3, V4, V5) \ + err = MPI_File_read_at_all (V0, V1, V2, V3, V4, V5); \ + CHK_MPIERR + +#define CHK_ERR_WRITE_AT_ALL(V0, V1, V2, V3, V4, V5) \ + err = MPI_File_write_at_all (V0, V1, V2, V3, V4, V5); \ + CHK_MPIERR + +#define CHK_ALLOC(V0) \ + if (V0 == NULL) { DEBUG_RETURN_ERROR (NC_ENOMEM) } + +typedef struct NC_chk_vector { + int esize; + int size; + int nalloc; + char *data; +} NC_chk_vector; + +// File +extern int ncchkioi_init (NC_chk *, int); +extern int ncchkioi_parse_var_info (NC_chk *); +extern int ncchkioi_var_list_init (NC_chk_var_list *); +extern int ncchkioi_var_list_free (NC_chk_var_list *); +extern int ncchkioi_var_list_add (NC_chk_var_list *); + +// Util +extern int ncchkioi_extract_hint (NC_chk *, MPI_Info); +extern int ncchkioi_export_hint (NC_chk *, MPI_Info); +extern MPI_Offset NC_Type_size (nc_type); +extern void ncchkioi_sort_file_offset (int, MPI_Aint *, MPI_Aint *, int *); +extern int ncchkioi_update_statistics (NC_chk *); +extern int ncchkioi_get_default_chunk_dim (NC_chk *); +extern int ncchkioi_subarray_off_len (int, int *, int *, int *, MPI_Offset *, int *); +extern void ncchkioi_idx_in_swapn (NC_chk_chunk_index_entry *, MPI_Offset); +#ifdef PNETCDF_PROFILING +extern void ncchkioi_print_profile (NC_chk *); +extern void ncchkioi_profile_add_time (NC_chk *ncchkp, int id, double t); +#endif + +// Misc +typedef struct ncchkioi_chunk_overlap_t { + MPI_Offset osize; + int rank; +} ncchkioi_chunk_overlap_t; +extern int ncchkioi_init_nvar_core_reduce (NC_chk *ncchkp, + int nvar, + NC_chk_var **varps, + int *rcnt, + int *roff, + MPI_Offset **starts, + MPI_Offset **counts); +extern int ncchkioi_calc_chunk_overlap (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset **starts, + MPI_Offset **counts, + ncchkioi_chunk_overlap_t *ocnt); +extern void 
ncchkioi_assign_chunk_owner (NC_chk *ncchkp, + NC_chk_var *varp, + ncchkioi_chunk_overlap_t *ocnt); +extern int ncchkioi_sync_ocnt_reduce (NC_chk *ncchkp, + int nchunk, + ncchkioi_chunk_overlap_t *ocnt, + ncchkioi_chunk_overlap_t *ocnt_all, + MPI_Request *req); +extern void ncchkioi_write_chunk_ocnt (NC_chk *ncchkp, + NC_chk_var *varp, + void *ocnt, + size_t ocnt_size); +extern int ncchkioi_calc_chunk_owner (NC_chk *, NC_chk_var *, int, MPI_Offset **, MPI_Offset **); +extern int ncchkioi_calc_chunk_owner_reduce ( + NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts); +extern int ncchkioi_calc_chunk_size (NC_chk *, NC_chk_var *, int, MPI_Offset **, MPI_Offset **); +extern int ncchkioiconvert (void *, void *, MPI_Datatype, MPI_Datatype, int); + +// Var +extern int ncchkioi_var_init (NC_chk *, NC_chk_var *, int, MPI_Offset **, MPI_Offset **); +extern int ncchkioi_load_var (NC_chk *, NC_chk_var *, int, int *); +extern int ncchkioi_load_var_bg (NC_chk *, NC_chk_var *, int, int *); +extern int ncchkioi_load_nvar (NC_chk *, int, int *, int *, int *); +extern int ncchkioi_load_nvar_bg (NC_chk *, int, int *, int *, int *); +extern int ncchkioi_save_var (NC_chk *, NC_chk_var *); +extern int ncchkioi_save_nvar (NC_chk *, int, int *); +extern void ncchkioi_var_free (NC_chk_var *); +extern int ncchkioi_var_resize (NC_chk *, NC_chk_var *); +extern int ncchkioi_init_nvar (NC_chk *, int, int *, int, int *); +extern int ncchkioi_resize_nvar (NC_chk *, int, int *, int, int *); + +// Cache +extern int ncchkioi_cache_alloc (NC_chk *, MPI_Offset, NC_chk_cache **); +extern void ncchkioi_cache_visit (NC_chk *, NC_chk_cache *); +extern void ncchkioi_cache_free (NC_chk *); + +// Chunks +extern int ncchkioi_chunk_itr_init ( + NC_chk_var *, const MPI_Offset *, const MPI_Offset *, MPI_Offset *, int *); +extern int ncchkioi_chunk_itr_next ( + NC_chk_var *, const MPI_Offset *, const MPI_Offset *, MPI_Offset *, int *); +extern MPI_Offset get_chunk_overlap ( + 
NC_chk_var *, MPI_Offset *, const MPI_Offset *, const MPI_Offset *, MPI_Offset *, MPI_Offset *); +extern int get_chunk_id (NC_chk_var *, MPI_Offset *); +extern int get_chunk_itr (NC_chk_var *, int, MPI_Offset *); +extern int ncchkioi_chunk_itr_init_ex (NC_chk_var *, + const MPI_Offset *, + const MPI_Offset *, + MPI_Offset *, + int *, + MPI_Offset *, + MPI_Offset *); +extern int ncchkioi_chunk_itr_next_ex (NC_chk_var *, + const MPI_Offset *, + const MPI_Offset *, + MPI_Offset *, + int *, + MPI_Offset *, + MPI_Offset *); + +// Get +// extern int ncchkioi_get_var_old(NC_chk*, NC_chk_var*, MPI_Offset*, MPI_Offset*, MPI_Offset*, +// void*); +extern int ncchkioi_get_var_cb_chunk ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_get_var_cb_proc ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_get_varn ( + NC_chk *, NC_chk_var *, int, MPI_Offset *const *, MPI_Offset *const *, const void *); +extern int ncchkioi_get_varn_cb_chunk (NC_chk *, + NC_chk_var *, + int, + MPI_Offset *const *, + MPI_Offset *const *, + MPI_Offset *const *, + void **); +extern int ncchkioi_get_varn_cb_proc ( + NC_chk *, NC_chk_var *, int, MPI_Offset *const *, MPI_Offset *const *, void **); +extern int ncchkioi_iget_var (NC_chk *, + int, + const MPI_Offset *, + const MPI_Offset *, + const MPI_Offset *, + const MPI_Offset *, + void *, + MPI_Offset, + MPI_Datatype, + int *); +extern int ncchkioi_iget_varn (NC_chk *, + int, + int, + MPI_Offset *const *, + MPI_Offset *const *, + void *, + MPI_Offset, + MPI_Datatype, + int *); +extern int ncchkioi_iget_cb_chunk (NC_chk *, int, int *, int *); +extern int ncchkioi_iget_cb_proc (NC_chk *, int, int *, int *); + +// Put +// extern int ncchkioi_put_var_old(NC_chk*, NC_chk_var*, const MPI_Offset*, const MPI_Offset*, const +// MPI_Offset*, void*); +extern int ncchkioi_put_var ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const 
MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_put_var_cb_chunk ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_put_var_cb_proc ( + NC_chk *, NC_chk_var *, const MPI_Offset *, const MPI_Offset *, const MPI_Offset *, void *); +extern int ncchkioi_put_varn ( + NC_chk *, NC_chk_var *, int, MPI_Offset *const *, MPI_Offset *const *, const void *); +extern int ncchkioi_put_varn_cb_chunk (NC_chk *, + NC_chk_var *, + int, + MPI_Offset *const *, + MPI_Offset *const *, + MPI_Offset *const *, + void **); +extern int ncchkioi_put_varn_cb_proc ( + NC_chk *, NC_chk_var *, int, MPI_Offset *const *, MPI_Offset *const *, void **); +extern int ncchkioi_iput_var (NC_chk *, + int, + const MPI_Offset *, + const MPI_Offset *, + const MPI_Offset *, + const void *, + const void *, + int *); +extern int ncchkioi_iput_varn (NC_chk *, + int, + int, + MPI_Offset *const *, + MPI_Offset *const *, + const void *, + const void *, + int *); +extern int ncchkioi_iput_cb_chunk (NC_chk *, int, int *, int *); +extern int ncchkioi_iput_cb_proc (NC_chk *, int, int *, int *); + +// Nonblocking +extern int ncchkioi_req_list_init (NC_chk_req_list *); +extern int ncchkioi_req_list_free (NC_chk_req_list *); +extern int ncchkioi_req_list_add (NC_chk_req_list *, int *); +extern int ncchkioi_req_list_remove (NC_chk_req_list *, int); +extern int ncchkioi_wait_put_reqs (NC_chk *, int, int *, int *); +extern int ncchkioi_wait_get_reqs (NC_chk *, int, int *, int *); +extern int ncchkioi_wait (NC_chk *, int, int *, int *, int); + +// Vector +extern int ncchkioi_vector_init (NC_chk_vector *, int); +extern int ncchkioi_vector_init_ex (NC_chk_vector *, int, int); +extern void ncchkioi_vector_free (NC_chk_vector *); +extern int ncchkioi_vector_append (NC_chk_vector *, void *); +#endif diff --git a/src/drivers/ncchunkio/ncchkio_var.c b/src/drivers/ncchunkio/ncchkio_var.c new file mode 100644 index 0000000000..1e65d6d5a2 --- /dev/null 
+++ b/src/drivers/ncchunkio/ncchkio_var.c @@ -0,0 +1,1125 @@ +/* + * Copyright (C) 2017, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_def_var() : dispatcher->def_var() + * ncmpi_inq_varid() : dispatcher->inq_varid() + * ncmpi_inq_var() : dispatcher->inq_var() + * ncmpi_rename_var() : dispatcher->rename_var() + * + * ncmpi_get_var() : dispatcher->get_var() + * ncmpi_put_var() : dispatcher->put_var() + * ncmpi_get_var_() : dispatcher->get_var() + * ncmpi_put_var_() : dispatcher->put_var() + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + * + * ncmpi_iget_var() : dispatcher->iget_var() + * ncmpi_iput_var() : dispatcher->iput_var() + * ncmpi_iget_var_() : dispatcher->iget_var() + * ncmpi_iput_var_() : dispatcher->iput_var() + * + * ncmpi_buffer_attach() : dispatcher->buffer_attach() + * ncmpi_buffer_detach() : dispatcher->buffer_detach() + * ncmpi_bput_var_() : dispatcher->bput_var() + * + * ncmpi_get_varn_() : dispatcher->get_varn() + * ncmpi_put_varn_() : dispatcher->put_varn() + * + * ncmpi_iget_varn_() : dispatcher->iget_varn() + * ncmpi_iput_varn_() : dispatcher->iput_varn() + * ncmpi_bput_varn_() : dispatcher->bput_varn() + * + * ncmpi_get_vard() : dispatcher->get_vard() + * ncmpi_put_vard() : dispatcher->put_vard() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkio_def_var ( + void *ncdp, const char *name, nc_type xtype, int ndims, const int *dimids, int *varidp) { + int i, err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + NC_chk_var *varp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT) + + 
err = ncchkioi_var_list_add (&(ncchkp->vars));
+    if (err < 0) return err;
+    *varidp = err; /* non-negative return value is the slot in ncchkp->vars */
+
+    varp = ncchkp->vars.data + (*varidp);
+
+    varp->ndim = ndims;
+    varp->chunkdim = NULL;
+    varp->chunk_index = NULL;
+    varp->chunk_owner = NULL;
+    varp->xtype = xtype;
+    varp->esize = NC_Type_size (xtype);
+    varp->etype = ncmpii_nc2mpitype (xtype);
+    varp->isnew = 1;
+    varp->expanded = 0;
+
+    if (ndims < 1) {  // Do not compress scalar
+        varp->varkind = NC_CHK_VAR_RAW;
+        /* A scalar has no dimension metadata.  Give every dimension field a
+         * defined value so that later readers of this entry never see
+         * indeterminate data. */
+        varp->dimids = NULL;
+        varp->dimsize = NULL;
+        varp->isrec = 0;
+
+        err = ncchkp->driver->def_var (ncchkp->ncp, name, xtype, ndims, dimids, &varp->varid);
+        if (err != NC_NOERR) return err;
+
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_varkind", NC_INT, 1,
+                                       &(varp->varkind), MPI_INT);  // Compressed var?
+        if (err != NC_NOERR) return err;
+    } else {
+        err = ncchkp->driver->def_var (ncchkp->ncp, name, xtype, 0, NULL,
+                                       &varp->varid);  // Dummy var for attrs
+        if (err != NC_NOERR) return err;
+
+        varp->varkind = NC_CHK_VAR_COMPRESSED;
+        varp->dimsize = NULL;
+        varp->dimids = (int *)NCI_Malloc (sizeof (int) * ndims);
+        if (varp->dimids == NULL) { DEBUG_RETURN_ERROR (NC_ENOMEM) }
+        memcpy (varp->dimids, dimids, sizeof (int) * ndims);
+        varp->dimsize = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ndims);
+        if (varp->dimsize == NULL) { DEBUG_RETURN_ERROR (NC_ENOMEM) }
+        for (i = 0; i < ndims; i++) {
+            /* Cache the current length of every dimension; a failed query
+             * would otherwise leave dimsize[i] undefined. */
+            err = ncchkp->driver->inq_dim (ncchkp->ncp, dimids[i], NULL, varp->dimsize + i);
+            if (err != NC_NOERR) return err;
+        }
+        /* A variable is a record variable when its first dimension is the
+         * file's record dimension. */
+        if (varp->dimids[0] == ncchkp->recdim) {
+            varp->isrec = 1;
+        } else {
+            varp->isrec = 0;
+        }
+
+        /* The real shape and type are stored as attributes of the dummy
+         * variable. */
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_ndim", NC_INT, 1, &ndims,
+                                       MPI_INT);  // Original dimensions
+        if (err != NC_NOERR) return err;
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_dimids", NC_INT, ndims, dimids,
+                                       MPI_INT);  // Dimension IDs
+        if (err != NC_NOERR) return err;
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_datatype", NC_INT, 1, &xtype,
+                                       MPI_INT);  // Original datatype
+        if (err != NC_NOERR) return err;
+        err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_varkind", NC_INT, 1,
+                                       &(varp->varkind), MPI_INT);  // Compressed var?
+        if (err != NC_NOERR) return err;
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    return NC_NOERR;
+}
+
+/*
+ * Look up a variable by name.
+ *
+ * The wrapped driver resolves the name to its own variable ID; that ID is
+ * then mapped back to the index of the matching entry in ncchkp->vars, which
+ * is the ID this driver exposes.  Returns NC_ENOTVAR when the wrapped ID has
+ * no entry.  When `varid` is NULL only the existence check is performed.
+ */
+int ncchkio_inq_varid (void *ncdp, const char *name, int *varid) {
+    int i, vid, err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    err = ncchkp->driver->inq_varid (ncchkp->ncp, name, &vid);
+    if (err != NC_NOERR) return err;
+
+    if (varid != NULL) {
+        for (i = 0; i < ncchkp->vars.cnt; i++) {
+            if (ncchkp->vars.data[i].varid == vid) {
+                *varid = i;
+                break;
+            }
+        }
+        if (i >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_ENOTVAR) }
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    return NC_NOERR;
+}
+
+/*
+ * Inquire about a variable.
+ *
+ * Name, type, attribute count, offset and fill settings are answered by the
+ * wrapped driver.  The number of dimensions and the dimension IDs are taken
+ * from this driver's own record instead, because the wrapped driver is never
+ * asked for them here.  Any output pointer may be NULL.
+ */
+int ncchkio_inq_var (void *ncdp,
+                     int varid,
+                     char *name,
+                     nc_type *xtypep,
+                     int *ndimsp,
+                     int *dimids,
+                     int *nattsp,
+                     MPI_Offset *offsetp,
+                     int *no_fillp,
+                     void *fill_valuep) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+    NC_chk_var *varp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+
+    varp = ncchkp->vars.data + varid;
+
+    err = ncchkp->driver->inq_var (ncchkp->ncp, varp->varid, name, xtypep, NULL, NULL, nattsp,
+                                   offsetp, no_fillp, fill_valuep);
+    if (err != NC_NOERR) return err;
+
+    if (ndimsp != NULL) { *ndimsp = varp->ndim; }
+
+    /* Variables without dimensions have no dimids array to copy from. */
+    if (dimids != NULL && varp->ndim > 0) {
+        memcpy (dimids, varp->dimids, sizeof (int) * varp->ndim);
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    return NC_NOERR;
+}
+
+/* Rename a variable by forwarding the request to the wrapped driver, using
+ * the wrapped driver's ID of the variable. */
+int ncchkio_rename_var (void *ncdp, int varid, const char *newname) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+    NC_chk_var *varp;
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    err = ncchkp->driver->rename_var (ncchkp->ncp, varp->varid, newname);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
+/*
+ * Blocking read of a subarray.
+ *
+ * Raw variables are forwarded unchanged to the wrapped driver.  For chunked
+ * variables the data is read through the chunk layer in the variable's own
+ * element type and converted to `buftype` afterwards when the two differ.
+ */
+int ncchkio_get_var (void *ncdp,
+                     int varid,
+                     const MPI_Offset *start,
+                     const MPI_Offset
*count,
+                     const MPI_Offset *stride,
+                     const MPI_Offset *imap,
+                     void *buf,
+                     MPI_Offset bufcount,
+                     MPI_Datatype buftype,
+                     int reqMode) {
+    int err=NC_NOERR, status = NC_NOERR, ret;
+    void *cbuf = (void *)buf;
+    void *xbuf = (void *)buf; /* differs from cbuf only when a conversion buffer is allocated */
+    MPI_Offset nelem = 0;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET)
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        return ncchkp->driver->get_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf,
+                                        bufcount, buftype, reqMode);
+    }
+
+    /* Delayed initialisation: set up the chunk metadata on first access. */
+    if (ncchkp->delay_init && (varp->chunkdim == NULL)) {
+        NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+        err = ncchkioi_var_init (ncchkp, varp, 1, (MPI_Offset **)&start, (MPI_Offset **)&count);
+        CHK_ERR
+
+        if (!(varp->isnew)) {
+            ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset",
+                                           &(varp->metaoff), MPI_LONG_LONG);
+            if (ret == NC_NOERR) {  // Read index table
+                /* Named differently from the function-level `status` so the
+                 * return-code variable is not shadowed. */
+                MPI_Status mpistatus;
+
+                // Set file view
+                CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh,
+                                  ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, MPI_BYTE, "native",
+                                  MPI_INFO_NULL);
+                // Read data
+                CHK_ERR_READ_AT_ALL (
+                    ((NC *)(ncchkp->ncp))->collective_fh, varp->metaoff, varp->chunk_index,
+                    sizeof (NC_chk_chunk_index_entry) * varp->nchunk, MPI_BYTE, &mpistatus);
+            } else {
+                /* No index table stored yet: start from an empty one. */
+                varp->metaoff = -1;
+                memset (varp->chunk_index, 0,
+                        sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1));
+            }
+        }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_GET)
+    }
+
+    /* A record variable whose cached size lags behind the file's record
+     * count is resized when the request reaches its current end. */
+    if (varp->isrec && (varp->dimsize[0] < ncchkp->recsize) &&
+        (start[0] + count[0] >= varp->dimsize[0])) {
+        NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE)
+
+        err = ncchkioi_var_resize (ncchkp, varp);
+        CHK_ERR
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_GET)
+    }
+
+    if (buftype != varp->etype) {
+        int i;
+
+        /* Read into a temporary buffer in the variable's element type. */
+        nelem = 1;
+        for (i = 0; i < varp->ndim; i++) { nelem *= count[i]; }
+
+        xbuf = (char *)NCI_Malloc (nelem * varp->esize);
+        CHK_PTR (xbuf)
+    } else {
+        xbuf = cbuf;
+    }
+
+    // Collective buffer
+    switch (ncchkp->comm_unit) {
+        case NC_CHK_COMM_CHUNK:
+            err = ncchkioi_get_var_cb_chunk (ncchkp, varp, start, count, stride, xbuf);
+            break;
+        case NC_CHK_COMM_PROC:
+            err = ncchkioi_get_var_cb_proc (ncchkp, varp, start, count, stride, xbuf);
+            break;
+    }
+    CHK_ERR
+
+    if (buftype != varp->etype) {
+        err = ncchkioiconvert (xbuf, cbuf, varp->etype, buftype, nelem);
+        /* Leave through the shared cleanup so the temporary buffer is freed. */
+        if (err != NC_NOERR) goto err_out;
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+err_out:;
+    /* Cleanup runs on success and on every error path.  xbuf is NULL when
+     * its allocation failed and equals cbuf when none was made. */
+    if (xbuf != cbuf && xbuf != NULL) NCI_Free (xbuf);
+    if (cbuf != buf) NCI_Free (cbuf);
+    if (status == NC_NOERR) status = err;
+    return status; /* first error encountered */
+}
+
+/*
+ * Blocking write of a subarray.
+ *
+ * Independent-mode requests (reqMode == NC_REQ_INDEP) are rejected with
+ * NC_ENOTSUPPORT.  Raw variables are forwarded unchanged to the wrapped
+ * driver; chunked variables are written through the chunk layer.
+ */
+int ncchkio_put_var (void *ncdp,
+                     int varid,
+                     const MPI_Offset *start,
+                     const MPI_Offset *count,
+                     const MPI_Offset *stride,
+                     const MPI_Offset *imap,
+                     const void *buf,
+                     MPI_Offset bufcount,
+                     MPI_Datatype buftype,
+                     int reqMode) {
+    int err=NC_NOERR, ret;
+    void *cbuf = (void *)buf;
+    void *xbuf = (void *)buf;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        return ncchkp->driver->put_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf,
+                                        bufcount, buftype, reqMode);
+    }
+
+    if (ncchkp->delay_init && (varp->chunkdim == NULL)) {
+        NC_CHK_TIMER_PAUSE
(NC_CHK_TIMER_PUT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+        err = ncchkioi_var_init (ncchkp, varp, 1, (MPI_Offset **)&start, (MPI_Offset **)&count);
+        CHK_ERR
+
+        if (!(varp->isnew)) {
+            ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset",
+                                           &(varp->metaoff), MPI_LONG_LONG);
+            if (ret == NC_NOERR) {  // Read index table
+                MPI_Status status;
+
+                // Set file view
+                CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh,
+                                  ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, MPI_BYTE, "native",
+                                  MPI_INFO_NULL);
+                // Read data
+                CHK_ERR_READ_AT_ALL (
+                    ((NC *)(ncchkp->ncp))->collective_fh, varp->metaoff, varp->chunk_index,
+                    sizeof (NC_chk_chunk_index_entry) * varp->nchunk, MPI_BYTE, &status);
+            } else {
+                /* No index table stored yet: start from an empty one. */
+                varp->metaoff = -1;
+                memset (varp->chunk_index, 0,
+                        sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1));
+            }
+        }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT)
+    }
+
+    if (imap != NULL || bufcount != -1) {
+        /* pack buf to cbuf -------------------------------------------------*/
+        /* If called from a true varm API or a flexible API, ncmpii_pack()
+         * packs user buf into a contiguous cbuf (need to be freed later).
+         * Otherwise, cbuf is simply set to buf. ncmpii_pack() also returns
+         * etype (MPI primitive datatype in buftype), and nelems (number of
+         * etypes in buftype * bufcount)
+         */
+        int ndims;
+        MPI_Offset nelems;
+        MPI_Datatype etype;
+
+        /* Take the dimensionality from this driver's own record.  `varid`
+         * indexes ncchkp->vars, not the wrapped driver's variable table, and
+         * def_var() creates chunked variables in the wrapped driver as
+         * 0-dimensional placeholders, so asking the wrapped driver would not
+         * describe `count`. */
+        ndims = varp->ndim;
+
+        err = ncmpii_pack (ndims, count, imap, (void *)buf, bufcount, buftype, &nelems, &etype,
+                           &cbuf);
+        if (err != NC_NOERR) goto err_check;
+
+        imap = NULL;
+        bufcount = (nelems == 0) ? 0 : -1; /* make it a high-level API */
+        buftype = etype;                   /* an MPI primitive type */
+    }
+
+err_check:
+    if (err != NC_NOERR) {
+        if (reqMode & NC_REQ_INDEP) return err;
+        reqMode |= NC_REQ_ZERO; /* participate collective call */
+        /* NOTE(review): the packing error is overwritten by the calls below
+         * and the write still uses the caller's counts - confirm whether a
+         * zero-length participation was intended here. */
+    }
+
+    if (buftype != varp->etype) {
+        int i;
+        MPI_Offset nelem;
+
+        /* Convert into a temporary buffer in the variable's element type. */
+        nelem = 1;
+        for (i = 0; i < varp->ndim; i++) { nelem *= count[i]; }
+
+        xbuf = (char *)NCI_Malloc (nelem * varp->esize);
+        CHK_PTR (xbuf)
+        err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem);
+        /* Leave through the shared cleanup so both buffers are released. */
+        if (err != NC_NOERR) goto err_out;
+    } else {
+        xbuf = cbuf;
+    }
+
+    err = ncchkioi_put_var (ncchkp, varp, start, count, stride, xbuf);
+    CHK_ERR
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+err_out:;
+    /* Each temporary buffer is released exactly once, on success and on
+     * every error path.  xbuf is NULL when its allocation failed and equals
+     * cbuf when no conversion buffer was made. */
+    if (xbuf != cbuf && xbuf != NULL) NCI_Free (xbuf);
+    if (cbuf != buf && cbuf != NULL) NCI_Free (cbuf);
+    return err; /* first error encountered */
+}
+
+/*
+ * Post a nonblocking read.
+ *
+ * Request IDs returned to the caller encode which layer owns the request:
+ * IDs from the wrapped driver (raw variables) are mapped to 2 * id + 1 and
+ * IDs from the chunk layer to 2 * id.
+ */
+int ncchkio_iget_var (void *ncdp,
+                      int varid,
+                      const MPI_Offset *start,
+                      const MPI_Offset *count,
+                      const MPI_Offset *stride,
+                      const MPI_Offset *imap,
+                      void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int *reqid,
+                      int reqMode) {
+    int err=NC_NOERR;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_IGET)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        err = ncchkp->driver->iget_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf,
+                                        bufcount, buftype, reqid, reqMode);
+        if (err != NC_NOERR) { return err; }
+        if (reqid != NULL) { *reqid = *reqid * 2 + 1; }
+        return NC_NOERR;
+    }
+
+    /* Report a failure to queue the request instead of returning NC_NOERR
+     * with a request ID that was never assigned. */
+    err = ncchkioi_iget_var (ncchkp, varid, start, count, stride, imap, buf, bufcount, buftype,
+                             reqid);
+    if (err != NC_NOERR) return err;
+    if (reqid != NULL) { (*reqid) *= 2; }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+
NC_CHK_TIMER_STOP (NC_CHK_TIMER_IGET)
+
+    return NC_NOERR;
+}
+
+/*
+ * Post a nonblocking write.
+ *
+ * Raw variables are forwarded to the wrapped driver (request ID mapped to
+ * 2 * id + 1).  For chunked variables the user buffer is packed and/or
+ * converted to the variable's element type if needed, then queued in the
+ * chunk layer (request ID mapped to 2 * id).
+ */
+int ncchkio_iput_var (void *ncdp,
+                      int varid,
+                      const MPI_Offset *start,
+                      const MPI_Offset *count,
+                      const MPI_Offset *stride,
+                      const MPI_Offset *imap,
+                      const void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int *reqid,
+                      int reqMode) {
+    int err=NC_NOERR;
+    void *cbuf = (void *)buf;
+    void *xbuf = (void *)buf;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_IPUT)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        err = ncchkp->driver->iput_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf,
+                                        bufcount, buftype, reqid, reqMode);
+        if (err != NC_NOERR) { return err; }
+        if (reqid != NULL) { *reqid = *reqid * 2 + 1; }
+        return NC_NOERR;
+    }
+
+    /* Track the largest record index written so far. */
+    if (varp->isrec) {
+        if (ncchkp->recsize < start[0] + count[0]) { ncchkp->recsize = start[0] + count[0]; }
+    }
+
+    if (imap != NULL || bufcount != -1) {
+        /* pack buf to cbuf -------------------------------------------------*/
+        /* If called from a true varm API or a flexible API, ncmpii_pack()
+         * packs user buf into a contiguous cbuf (need to be freed later).
+         * Otherwise, cbuf is simply set to buf. ncmpii_pack() also returns
+         * etype (MPI primitive datatype in buftype), and nelems (number of
+         * etypes in buftype * bufcount)
+         */
+        int ndims;
+        MPI_Offset nelems;
+        MPI_Datatype etype;
+
+        /* Take the dimensionality from this driver's own record: `varid`
+         * indexes ncchkp->vars, not the wrapped driver's variable table. */
+        ndims = varp->ndim;
+
+        err = ncmpii_pack (ndims, count, imap, (void *)buf, bufcount, buftype, &nelems, &etype,
+                           &cbuf);
+        if (err != NC_NOERR) goto err_check;
+
+        imap = NULL;
+        bufcount = (nelems == 0) ? 0 : -1; /* make it a high-level API */
+        buftype = etype;                   /* an MPI primitive type */
+    }
+
+err_check:
+    if (err != NC_NOERR) {
+        if (reqMode & NC_REQ_INDEP) return err;
+        reqMode |= NC_REQ_ZERO; /* participate collective call */
+    }
+
+    if (buftype != varp->etype) {
+        int i;
+        MPI_Offset nelem;
+
+        nelem = 1;
+        for (i = 0; i < varp->ndim; i++) { nelem *= count[i]; }
+
+        xbuf = (char *)NCI_Malloc (nelem * varp->esize);
+        CHK_PTR (xbuf)
+        err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem);
+        if (err != NC_NOERR) {
+            /* The conversion buffer has not been handed to the request
+             * queue yet, so it must be released here. */
+            NCI_Free (xbuf);
+            xbuf = NULL;
+            goto err_out;
+        }
+    } else {
+        xbuf = cbuf;
+    }
+
+    err = ncchkioi_iput_var (ncchkp, varid, start, count, stride, xbuf, buf, reqid);
+    /* Only a successfully posted request has an ID to translate. */
+    if (err == NC_NOERR && reqid != NULL) { (*reqid) *= 2; }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_IPUT)
+
+err_out:;
+    /* Release the packed copy unless it is the very buffer passed on as
+     * xbuf.  Running this after the label also covers the allocation and
+     * conversion failures above. */
+    if (cbuf != buf && cbuf != xbuf) NCI_Free (cbuf);
+    return err;
+}
+
+/* Attach a write buffer for the buffered-put API; handled entirely by the
+ * wrapped driver. */
+int ncchkio_buffer_attach (void *ncdp, MPI_Offset bufsize) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    err = ncchkp->driver->buffer_attach (ncchkp->ncp, bufsize);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
+/* Detach the write buffer; handled entirely by the wrapped driver. */
+int ncchkio_buffer_detach (void *ncdp) {
+    int err=NC_NOERR;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    err = ncchkp->driver->buffer_detach (ncchkp->ncp);
+    if (err != NC_NOERR) return err;
+
+    return NC_NOERR;
+}
+
+/*
+ * Post a buffered nonblocking write.
+ *
+ * Raw variables use the wrapped driver's bput_var.  For chunked variables
+ * the data is always copied (or converted) into a freshly allocated buffer
+ * before being queued in the chunk layer.
+ */
+int ncchkio_bput_var (void *ncdp,
+                      int varid,
+                      const MPI_Offset *start,
+                      const MPI_Offset *count,
+                      const MPI_Offset *stride,
+                      const MPI_Offset *imap,
+                      const void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int *reqid,
+                      int reqMode) {
+    int err=NC_NOERR;
+    int i;
+    void *cbuf = (void *)buf;
+    void *xbuf;
+    MPI_Offset nelem;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_IPUT)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data +
varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        err = ncchkp->driver->bput_var (ncchkp->ncp, varp->varid, start, count, stride, imap, buf,
+                                        bufcount, buftype, reqid, reqMode);
+        if (err != NC_NOERR) { return err; }
+        if (reqid != NULL) { *reqid = *reqid * 2 + 1; }
+        return NC_NOERR;
+    }
+
+    /* Track the largest record index written so far. */
+    if (varp->isrec) {
+        if (ncchkp->recsize < start[0] + count[0]) { ncchkp->recsize = start[0] + count[0]; }
+    }
+
+    if (imap != NULL || bufcount != -1) {
+        /* pack buf to cbuf -------------------------------------------------*/
+        /* If called from a true varm API or a flexible API, ncmpii_pack()
+         * packs user buf into a contiguous cbuf (need to be freed later).
+         * Otherwise, cbuf is simply set to buf. ncmpii_pack() also returns
+         * etype (MPI primitive datatype in buftype), and nelems (number of
+         * etypes in buftype * bufcount)
+         */
+        int ndims;
+        MPI_Offset nelems;
+        MPI_Datatype etype;
+
+        /* Take the dimensionality from this driver's own record: `varid`
+         * indexes ncchkp->vars, not the wrapped driver's variable table. */
+        ndims = varp->ndim;
+
+        err = ncmpii_pack (ndims, count, imap, (void *)buf, bufcount, buftype, &nelems, &etype,
+                           &cbuf);
+        if (err != NC_NOERR) goto err_check;
+
+        imap = NULL;
+        bufcount = (nelems == 0) ? 0 : -1; /* make it a high-level API */
+        buftype = etype;                   /* an MPI primitive type */
+    }
+
+err_check:
+    if (err != NC_NOERR) {
+        if (reqMode & NC_REQ_INDEP) return err;
+        reqMode |= NC_REQ_ZERO; /* participate collective call */
+    }
+
+    /* Buffered put: the queued request always gets its own copy of the data. */
+    nelem = 1;
+    for (i = 0; i < varp->ndim; i++) { nelem *= count[i]; }
+
+    xbuf = (char *)NCI_Malloc (nelem * varp->esize);
+    CHK_PTR (xbuf)
+
+    if (buftype != varp->etype) {
+        err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem);
+        if (err != NC_NOERR) {
+            /* The copy has not been queued yet, so release it here. */
+            NCI_Free (xbuf);
+            goto err_out;
+        }
+    } else {
+        memcpy (xbuf, cbuf, varp->esize * nelem);
+    }
+
+    err = ncchkioi_iput_var (ncchkp, varid, start, count, stride, xbuf, buf, reqid);
+    CHK_ERR
+    if (reqid != NULL) { (*reqid) *= 2; }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_IPUT)
+
+err_out:;
+    /* The packed copy is released on success and on every error path. */
+    if (cbuf != buf && cbuf != NULL) NCI_Free (cbuf);
+    return err;
+}
+
+/*
+ * Blocking read of `num` subarrays of one variable.
+ *
+ * Independent-mode requests are rejected with NC_ENOTSUPPORT.  Raw variables
+ * are forwarded to the wrapped driver; chunked variables are read through
+ * the chunk layer and converted to `buftype` when it differs from the
+ * variable's element type.
+ */
+int ncchkio_get_varn (void *ncdp,
+                      int varid,
+                      int num,
+                      MPI_Offset *const *starts,
+                      MPI_Offset *const *counts,
+                      void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int reqMode) {
+    int err=NC_NOERR, ret;
+    int i;
+    void *cbuf = (void *)buf;
+    void *xbuf = (void *)buf;
+    MPI_Offset nelem;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        return ncchkp->driver->get_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf,
+                                         bufcount, buftype, reqMode);
+    }
+
+    if (ncchkp->delay_init && (varp->chunkdim == NULL)) {
+        NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+        err = ncchkioi_var_init (ncchkp, varp, num, (MPI_Offset **)starts, (MPI_Offset **)counts);
+        CHK_ERR
+
+        if (!(varp->isnew)) {
+            ret =
ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset",
+                                           &(varp->metaoff), MPI_LONG_LONG);
+            if (ret == NC_NOERR) {  // Read index table
+                MPI_Status status;
+
+                // Set file view
+                CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh,
+                                  ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, MPI_BYTE, "native",
+                                  MPI_INFO_NULL);
+                // Read data
+                CHK_ERR_READ_AT_ALL (
+                    ((NC *)(ncchkp->ncp))->collective_fh, varp->metaoff, varp->chunk_index,
+                    sizeof (NC_chk_chunk_index_entry) * varp->nchunk, MPI_BYTE, &status);
+            } else {
+                /* No index table stored yet: start from an empty one. */
+                varp->metaoff = -1;
+                memset (varp->chunk_index, 0,
+                        sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1));
+            }
+        }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_GET)
+    }
+
+    /* Resize once if any request reaches the cached end of a record variable
+     * that lags behind the file's record count. */
+    if (varp->isrec && (varp->dimsize[0] < ncchkp->recsize)) {
+        for (i = 0; i < num; i++) {
+            if (starts[i][0] + counts[i][0] >= varp->dimsize[0]) {
+                NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET)
+                NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE)
+
+                err = ncchkioi_var_resize (ncchkp, varp);
+                CHK_ERR
+
+                NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE)
+                NC_CHK_TIMER_START (NC_CHK_TIMER_GET)
+
+                break;
+            }
+        }
+    }
+
+    if (buftype != varp->etype) {
+        int j;
+        MPI_Offset tmp;
+
+        /* Total element count over all requests. */
+        nelem = 0;
+        for (i = 0; i < num; i++) {
+            tmp = 1;
+            for (j = 0; j < varp->ndim; j++) { tmp *= counts[i][j]; }
+            nelem += tmp;
+        }
+
+        xbuf = (char *)NCI_Malloc (nelem * varp->esize);
+        CHK_PTR (xbuf)
+    } else {
+        xbuf = cbuf;
+    }
+
+    err = ncchkioi_get_varn (ncchkp, varp, num, starts, counts, xbuf);
+    /* Leave through the shared cleanup so the temporary buffer is freed. */
+    if (err != NC_NOERR) goto err_out;
+
+    if (buftype != varp->etype) {
+        err = ncchkioiconvert (xbuf, cbuf, varp->etype, buftype, nelem);
+        if (err != NC_NOERR) goto err_out;
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+err_out:;
+    /* xbuf is NULL when its allocation failed and equals cbuf when no
+     * conversion buffer was made. */
+    if (xbuf != cbuf && xbuf != NULL) NCI_Free (xbuf);
+    if (cbuf != buf) NCI_Free (cbuf);
+    return err;
+}
+
+/*
+ * Blocking write of `num` subarrays of one variable.
+ *
+ * Independent-mode requests are rejected with NC_ENOTSUPPORT.  Raw variables
+ * are forwarded to the wrapped driver; for chunked variables the data is
+ * converted to the variable's element type when needed and written through
+ * the chunk layer.
+ */
+int ncchkio_put_varn (void *ncdp,
+                      int varid,
+                      int num,
+                      MPI_Offset *const *starts,
+                      MPI_Offset *const *counts,
+                      const void *buf,
+                      MPI_Offset bufcount,
+                      MPI_Datatype buftype,
+                      int reqMode) {
+    int err=NC_NOERR, ret;
+    void *cbuf = (void *)buf;
+    void *xbuf = (void *)buf;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        return ncchkp->driver->put_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf,
+                                         bufcount, buftype, reqMode);
+    }
+
+    if (ncchkp->delay_init && (varp->chunkdim == NULL)) {
+        NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+        err = ncchkioi_var_init (ncchkp, varp, num, (MPI_Offset **)starts, (MPI_Offset **)counts);
+        CHK_ERR
+        if (!(varp->isnew)) {
+            ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset",
+                                           &(varp->metaoff), MPI_LONG_LONG);
+            if (ret == NC_NOERR) {  // Read index table
+                MPI_Status status;
+
+                // Set file view
+                CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh,
+                                  ((NC *)(ncchkp->ncp))->begin_var, MPI_BYTE, MPI_BYTE, "native",
+                                  MPI_INFO_NULL);
+                // Read data
+                CHK_ERR_READ_AT_ALL (
+                    ((NC *)(ncchkp->ncp))->collective_fh, varp->metaoff, varp->chunk_index,
+                    sizeof (NC_chk_chunk_index_entry) * varp->nchunk, MPI_BYTE, &status);
+            } else {
+                /* No index table stored yet: start from an empty one. */
+                varp->metaoff = -1;
+                memset (varp->chunk_index, 0,
+                        sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1));
+            }
+        }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT)
+    }
+
+    if (buftype != varp->etype) {
+        int i, j;
+        MPI_Offset nelem, tmp;
+
+        /* Total element count over all requests. */
+        nelem = 0;
+        for (i = 0; i < num; i++) {
+            tmp = 1;
+            for (j = 0; j < varp->ndim; j++) { tmp *= counts[i][j]; }
+            nelem += tmp;
+        }
+
+        xbuf = (char *)NCI_Malloc (nelem * varp->esize);
+        CHK_PTR (xbuf)
+        err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem);
+        /* Leave through the shared cleanup so the temporary buffer is freed. */
+        if (err != NC_NOERR) goto err_out;
+    } else {
+        xbuf = cbuf;
+    }
+
+    /* Success and failure both continue into the cleanup below. */
+    err = ncchkioi_put_varn (ncchkp, varp, num, starts, counts, xbuf);
+
+err_out:;
+    /* xbuf is NULL when its allocation failed and equals cbuf when no
+     * conversion buffer was made. */
+    if (xbuf != cbuf && xbuf != NULL) NCI_Free (xbuf);
+    if (cbuf != buf) NCI_Free (cbuf);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    return err;
+}
+
+/*
+ * Post a nonblocking read of `num` subarrays.
+ *
+ * Request IDs from the wrapped driver (raw variables) are mapped to
+ * 2 * id + 1, IDs from the chunk layer to 2 * id.
+ */
+int ncchkio_iget_varn (void *ncdp,
+                       int varid,
+                       int num,
+                       MPI_Offset *const *starts,
+                       MPI_Offset *const *counts,
+                       void *buf,
+                       MPI_Offset bufcount,
+                       MPI_Datatype buftype,
+                       int *reqid,
+                       int reqMode) {
+    int err=NC_NOERR;
+    NC_chk_var *varp;
+    NC_chk *ncchkp = (NC_chk *)ncdp;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_IGET)
+
+    if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); }
+
+    if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); }
+    varp = ncchkp->vars.data + varid;
+
+    if (varp->varkind == NC_CHK_VAR_RAW) {
+        err = ncchkp->driver->iget_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf,
+                                         bufcount, buftype, reqid, reqMode);
+        if (err != NC_NOERR) { return err; }
+        if (reqid != NULL) { *reqid = *reqid * 2 + 1; }
+        return NC_NOERR;
+    }
+
+    err = ncchkioi_iget_varn (ncchkp, varid, num, starts, counts, buf, bufcount, buftype, reqid);
+    if (err != NC_NOERR) return err;
+    if (reqid != NULL) { (*reqid) *= 2; }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL)
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_IGET)
+
+    return NC_NOERR;
+}
+
+int ncchkio_iput_varn (void *ncdp,
+                       int varid,
+                       int num,
+                       MPI_Offset *const *starts,
+                       MPI_Offset *const *counts,
+                       const void *buf,
+                       MPI_Offset bufcount,
+                       MPI_Datatype buftype,
+                       int *reqid,
+                       int reqMode) {
+    int err=NC_NOERR;
+    int i;
+    void *cbuf = (void *)buf;
+    void *xbuf = (void *)buf;
+    NC_chk_var *varp;
+    NC_chk *ncchkp
= (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_START (NC_CHK_TIMER_IPUT) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->isrec) { + for (i = 0; i < num; i++) { + if (ncchkp->recsize < starts[i][0] + counts[i][0]) { + ncchkp->recsize = starts[i][0] + counts[i][0]; + } + } + } + + if (varp->varkind == NC_CHK_VAR_RAW) { + err = ncchkp->driver->iput_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf, + bufcount, buftype, reqid, reqMode); + if (err != NC_NOERR) { return err; } + if (reqid != NULL) { *reqid = *reqid * 2 + 1; } + return NC_NOERR; + } + + if (buftype != varp->etype) { + int j; + MPI_Offset nelem, tmp; + + nelem = 0; + for (i = 0; i < num; i++) { + tmp = 1; + for (j = 0; j < varp->ndim; j++) { tmp *= counts[i][j]; } + nelem += tmp; + } + + xbuf = (char *)NCI_Malloc (nelem * varp->esize); + err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem); + if (err != NC_NOERR) return err; + } else { + xbuf = cbuf; + } + + err = ncchkioi_iput_varn (ncchkp, varid, num, starts, counts, xbuf, buf, reqid); + if (err != NC_NOERR) return err; + if (reqid != NULL) { (*reqid) *= 2; } + + if (cbuf != buf && cbuf != xbuf) NCI_Free (cbuf); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_IPUT) + + return NC_NOERR; +} + +int ncchkio_bput_varn (void *ncdp, + int varid, + int num, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid, + int reqMode) { + int err=NC_NOERR; + int i, j; + void *cbuf = (void *)buf; + void *xbuf = (void *)buf; + MPI_Offset nelem, tmp; + NC_chk_var *varp; + NC_chk *ncchkp = (NC_chk *)ncdp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_START (NC_CHK_TIMER_IPUT) + + if (reqMode == NC_REQ_INDEP) { DEBUG_RETURN_ERROR 
(NC_ENOTSUPPORT); } + + if (varid < 0 || varid >= ncchkp->vars.cnt) { DEBUG_RETURN_ERROR (NC_EINVAL); } + varp = ncchkp->vars.data + varid; + + if (varp->isrec) { + for (i = 0; i < num; i++) { + if (ncchkp->recsize < starts[i][0] + counts[i][0]) { + ncchkp->recsize = starts[i][0] + counts[i][0]; + } + } + } + + if (varp->varkind == NC_CHK_VAR_RAW) { + err = ncchkp->driver->bput_varn (ncchkp->ncp, varp->varid, num, starts, counts, buf, + bufcount, buftype, reqid, reqMode); + if (err != NC_NOERR) { return err; } + if (reqid != NULL) { *reqid = *reqid * 2 + 1; } + return NC_NOERR; + } + + nelem = 0; + for (i = 0; i < num; i++) { + tmp = 1; + for (j = 0; j < varp->ndim; j++) { tmp *= counts[i][j]; } + nelem += tmp; + } + xbuf = (char *)NCI_Malloc (nelem * varp->esize); + + if (buftype != varp->etype) { + err = ncchkioiconvert (cbuf, xbuf, buftype, varp->etype, nelem); + if (err != NC_NOERR) return err; + } else { + memcpy (xbuf, cbuf, nelem * varp->esize); + } + + err = ncchkioi_iput_varn (ncchkp, varid, num, starts, counts, xbuf, buf, reqid); + if (err != NC_NOERR) return err; + if (reqid != NULL) { (*reqid) *= 2; } + + if (cbuf != buf && cbuf != xbuf) NCI_Free (cbuf); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_TOTAL) + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_IPUT) + + return NC_NOERR; +} + +int ncchkio_get_vard (void *ncdp, + int varid, + MPI_Datatype filetype, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); + + err = ncchkp->driver->get_vard (ncchkp->ncp, varid, filetype, buf, bufcount, buftype, reqMode); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} + +int ncchkio_put_vard (void *ncdp, + int varid, + MPI_Datatype filetype, + const void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int reqMode) { + int err=NC_NOERR; + NC_chk *ncchkp = (NC_chk *)ncdp; + + DEBUG_RETURN_ERROR (NC_ENOTSUPPORT); + + err = ncchkp->driver->put_vard 
(ncchkp->ncp, varid, filetype, buf, bufcount, buftype, reqMode); + if (err != NC_NOERR) return err; + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkioi_cache.c b/src/drivers/ncchunkio/ncchkioi_cache.c new file mode 100644 index 0000000000..89b9f360d9 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_cache.c @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +static int ncchkioi_cache_evict (NC_chk *ncchkp) { + int err=NC_NOERR; + NC_chk_cache *target; + + target = ncchkp->cache_head; + + if (target == NULL || target->serial >= ncchkp->cache_serial) { + printf ("Rank %d: Cache limit exceeded\n", ncchkp->rank); + RET_ERR(NC_ENOMEM) + } + + // Remove from list + ncchkp->cache_head = target->next; + if (ncchkp->cache_tail == target) { ncchkp->cache_tail = NULL; } + + ncchkp->cache_used -= target->bsize; // Return budget + ncchkp->cache_head = target->next; + + *(target->ref) = NULL; // Mark as evicted + NCI_Free (target->buf); + NCI_Free (target); + +err_out:; + return err; +} + +int ncchkioi_cache_alloc (NC_chk *ncchkp, MPI_Offset size, NC_chk_cache **ref) { + int err=NC_NOERR; + NC_chk_cache *target; + + // Evict cached data if no space + if (ncchkp->cache_limit > 0) { + while (ncchkp->cache_used + size > ncchkp->cache_limit) { + err = ncchkioi_cache_evict (ncchkp); + CHK_ERR + } + } + ncchkp->cache_used += size; + + // Prepare cache entry + target = (NC_chk_cache *)NCI_Malloc (sizeof (NC_chk_cache)); + if (target == NULL) { DEBUG_RETURN_ERROR (NC_ENOMEM) } + target->bsize = size; + target->next = NULL; + target->prev = ncchkp->cache_tail; + target->ref = ref; + target->serial = ncchkp->cache_serial; + target->buf = NCI_Malloc (size); +#ifdef PNETCDF_DEBUG + memset (target->buf, 
0, size); +#endif + + // Insert to list tail + if (ncchkp->cache_tail != NULL) { + ncchkp->cache_tail->next = target; + } else { + ncchkp->cache_head = target; + } + ncchkp->cache_tail = target; + + // Assign reference + *ref = target; + +err_out:; + return err; +} + +void ncchkioi_cache_visit (NC_chk *ncchkp, NC_chk_cache *target) { + if (target != ncchkp->cache_tail) { + // Remove from list + if (target->prev != NULL) { target->prev->next = target->next; } + if (target->next != NULL) { target->next->prev = target->prev; } + + // Insert to list tail + target->next = NULL; + target->prev = ncchkp->cache_tail; + ncchkp->cache_tail->next = target; + ncchkp->cache_tail = target; + } +} + +void ncchkioi_cache_free (NC_chk *ncchkp) { + NC_chk_cache *pre, *cur; + + cur = ncchkp->cache_head; + while (cur != NULL) { + pre = cur; + cur = cur->next; + NCI_Free (pre->buf); + NCI_Free (pre); + } +} diff --git a/src/drivers/ncchunkio/ncchkioi_chunk.c b/src/drivers/ncchunkio/ncchkioi_chunk.c new file mode 100644 index 0000000000..10cb458919 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_chunk.c @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include "ncchkio_internal.h" + + +#define min(a,b) (((a)<(b))?(a):(b)) +#define max(a,b) (((a)>(b))?(a):(b)) + +MPI_Offset get_chunk_overlap(NC_chk_var *varp, MPI_Offset* cord, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *ostart, MPI_Offset *ocount){ + int i; + MPI_Offset ret = varp->esize; + + for(i = 0; i < varp->ndim; i++){ + ostart[i] = max(start[i], cord[i]); + ocount[i] = min(start[i] + count[i], cord[i] + varp->chunkdim[i]) - ostart[i]; + if (ocount[i] <= 0){ + ocount[i] = 0; + } + ret *= ocount[i]; + } + + return ret; +} + +int get_chunk_id(NC_chk_var *varp, MPI_Offset *cord){ + int i, ret; + + ret = (int)(cord[0]) / varp->chunkdim[0]; + for(i = 1; i < varp->ndim; i++){ + ret = ret * varp->nchunks[i] + (int)(cord[i]) / varp->chunkdim[i]; + } + + return ret; +} + +int get_chunk_itr(NC_chk_var *varp, int idx, MPI_Offset* cord){ + int i; + + for(i = varp->ndim - 1; i >= 0; i--){ + cord[i] = (idx % varp->nchunks[i]) * varp->chunkdim[i]; + idx /= varp->nchunks[i]; + } + + return 0; +} + +int ncchkioi_chunk_itr_init(NC_chk_var *varp, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *citr, int *cid){ + int i; + + *cid = 0; + for(i = 0; i < varp->ndim; i++){ + citr[i] = start[i] - (start[i] % varp->chunkdim[i]); + *cid += citr[i] / varp->chunkdim[i] * varp->cidsteps[i]; + } + + return NC_NOERR; +} + +int ncchkioi_chunk_itr_next(NC_chk_var *varp, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *citr, int *cid){ + int i, j; + + i = varp->ndim - 1; + citr[i] += varp->chunkdim[i]; + (*cid)++; + for(; i > 0; i--){ + if (citr[i] >= start[i] + count[i]){ + citr[i - 1] += varp->chunkdim[i - 1]; + j = 
citr[i]; + citr[i] = start[i] - (start[i] % varp->chunkdim[i]); + *cid += varp->cidsteps[i - 1] - varp->cidsteps[i] * (j - citr[i]) / varp->chunkdim[i]; + } + else{ + break; + } + } + + if (citr[0] >= start[0] + count[0]){ + return 0; + } + + return 1; +} + +int ncchkioi_chunk_itr_init_ex(NC_chk_var *varp, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *citr, int *cid, MPI_Offset *ostart, MPI_Offset *ocount){ + int i; + + *cid = 0; + for(i = 0; i < varp->ndim; i++){ + citr[i] = start[i] - (start[i] % varp->chunkdim[i]); + *cid += citr[i] / varp->chunkdim[i] * varp->cidsteps[i]; + ostart[i] = start[i]; + ocount[i] = min(count[i], citr[i] + varp->chunkdim[i] - ostart[i]); + } + + return NC_NOERR; +} + +int ncchkioi_chunk_itr_next_ex(NC_chk_var *varp, const MPI_Offset *start, const MPI_Offset *count, MPI_Offset *citr, int *cid, MPI_Offset *ostart, MPI_Offset *ocount){ + int i, j; + + i = varp->ndim - 1; + citr[i] += varp->chunkdim[i]; + + (*cid)++; + for(; i > 0; i--){ + if (citr[i] >= start[i] + count[i]){ + citr[i - 1] += varp->chunkdim[i - 1]; + ostart[i - 1] += ocount[i - 1]; + ocount[i - 1] = min(varp->chunkdim[i - 1], start[i - 1] + count[i - 1] - ostart[i - 1]); + j = citr[i]; + citr[i] = start[i] - (start[i] % varp->chunkdim[i]); + ostart[i] = start[i]; + ocount[i] = min(count[i], citr[i] + varp->chunkdim[i] - ostart[i]); + *cid += varp->cidsteps[i - 1] - varp->cidsteps[i] * (j - citr[i]) / varp->chunkdim[i]; + } + else{ + break; + } + } + + if (citr[0] >= start[0] + count[0]){ + return 0; + } + + if (i == varp->ndim - 1){ + ostart[i] += ocount[i]; + ocount[i] = min(varp->chunkdim[i], start[i] + count[i] - ostart[i]); + for (i++; i < varp->ndim; i++) { + ostart[i] = start[i]; + ocount[i] = min(count[i], citr[i] + varp->chunkdim[i] - ostart[i]); + } + } + + return 1; +} diff --git a/src/drivers/ncchunkio/ncchkioi_chunk_owner.c b/src/drivers/ncchunkio/ncchkioi_chunk_owner.c new file mode 100644 index 0000000000..7e0db237f9 --- /dev/null +++ 
b/src/drivers/ncchunkio/ncchkioi_chunk_owner.c @@ -0,0 +1,621 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +void ncchkioi_write_chunk_ocnt (NC_chk *ncchkp, NC_chk_var *varp, void *ocnt, size_t ocnt_size) { +#ifdef PNETCDF_PROFILING + { + int i, j; + char *pprefix = getenv ("PNETCDF_OWNER_PREFIX"); + + if (pprefix != NULL) { + if (ncchkp->rank == 0) { + void *ocnt_in; + int *cown; + MPI_Status stat; + FILE *pfile; + char fname[1024], ppath[1024]; + + ocnt_in = NCI_Malloc (ocnt_size * varp->nchunkrec); + cown = NCI_Malloc (sizeof (int) * varp->nchunkrec); + + strcpy (fname, ncchkp->path); + for (i = strlen (fname); i > 0; i--) { + if (fname[i] == '.') { + fname[i] = '\0'; + } else if (fname[i] == '\\' || fname[i] == '/') { + i++; + break; + } + } + sprintf (ppath, "%s%s_owner.csv", pprefix, fname + i); + pfile = fopen (ppath, "a"); + + fprintf (pfile, "Var:, %d\n", varp->varid); + fprintf (pfile, "Rank\\Chunk, "); + for (j = 0; j < varp->nchunkrec; j++) { fprintf (pfile, "%d, ", j); } + fprintf (pfile, "\nOwner, "); + for (j = 0; j < varp->nchunk; j++) { + fprintf (pfile, "%d, ", varp->chunk_owner[j]); + } + fprintf (pfile, "\n0, "); + if (ocnt_size == sizeof (MPI_Offset)) { + for (j = 0; j < varp->nchunkrec; j++) { + fprintf (pfile, "%lld, ", ((MPI_Offset *)ocnt)[j]); + } + } else { + for (j = 0; j < varp->nchunkrec; j++) { + fprintf (pfile, "%lld, ", ((ncchkioi_chunk_overlap_t *)ocnt)[j].osize); + } + } + fprintf (pfile, "\n"); + for (i = 1; i < ncchkp->np; i++) { + if (ocnt_size == sizeof (MPI_Offset)) { + MPI_Recv (ocnt_in, varp->nchunkrec, MPI_LONG_LONG, i, 0, ncchkp->comm, + &stat); + fprintf (pfile, "%d, ", i); + for (j = 0; j < varp->nchunkrec; j++) { + fprintf (pfile, "%lld, ", ((MPI_Offset 
*)ocnt_in)[j]); + } + } else { + MPI_Recv (ocnt_in, varp->nchunkrec, ncchkp->overlaptype, i, 0, ncchkp->comm, + &stat); + fprintf (pfile, "%d, ", i); + for (j = 0; j < varp->nchunkrec; j++) { + fprintf (pfile, "%lld, ", + ((ncchkioi_chunk_overlap_t *)ocnt_in)[j].osize); + } + } + fprintf (pfile, "\n"); + + MPI_Recv (cown, varp->nchunkrec, MPI_INT, i, 0, ncchkp->comm, &stat); + for (j = 0; j < varp->nchunkrec; j++) { + if (cown[j] != varp->chunk_owner[j]) { + printf ("Warning: cown[%d][%d] on rank %d = %d, != %d\n", varp->varid, j, + i, cown[j], varp->chunk_owner[j]); + } + } + } + + fclose (pfile); + NCI_Free (ocnt_in); + NCI_Free (cown); + } else { + if (ocnt_size == sizeof (MPI_Offset)) { + MPI_Send (ocnt, varp->nchunkrec, MPI_LONG_LONG, 0, 0, ncchkp->comm); + } else { + MPI_Send (ocnt, varp->nchunkrec, ncchkp->overlaptype, 0, 0, ncchkp->comm); + } + MPI_Send (varp->chunk_owner, varp->nchunkrec, MPI_INT, 0, 0, ncchkp->comm); + } + } + } +#endif +} + +void max_osize_rank_op (void *inp, void *inoutp, int *len, MPI_Datatype *dptr) { + int i; + ncchkioi_chunk_overlap_t *in = (ncchkioi_chunk_overlap_t *)inp; + ncchkioi_chunk_overlap_t *inout = (ncchkioi_chunk_overlap_t *)inoutp; + + for (i = 0; i < *len; i++) { + if (in->osize > inout->osize) { + inout->osize = in->osize; + inout->rank = in->rank; + } else if ((in->osize == inout->osize) && (in->rank < inout->rank)) { + inout->osize = in->osize; + inout->rank = in->rank; + } + in++; + inout++; + } +} + +int ncchkioi_calc_chunk_owner ( + NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) { + return ncchkioi_calc_chunk_owner_reduce (ncchkp, varp, nreq, starts, counts); +} + +static inline void ncchkioi_rec_chunk_overlap (MPI_Offset *ostart, + MPI_Offset *osize, + MPI_Offset *citr, + NC_chk_var *varp, + MPI_Offset *ocnt, + NC_chk_req *reqp) { + int i; + int req; + int cid; // Chunk iterator + MPI_Offset overlapsize; + + for (req = 0; req < reqp->nreq; req++) { + 
ncchkioi_chunk_itr_init_ex (varp, reqp->starts[req], reqp->counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (cid < varp->nchunkrec) { // Count only first record + // Count overlap + overlapsize = 1; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ocnt[cid] += (double)overlapsize; + if (ocnt[cid] > varp->chunksize) { ocnt[cid] = (double)varp->chunksize; } + } + } while (ncchkioi_chunk_itr_next_ex (varp, reqp->starts[req], reqp->counts[req], citr, &cid, + ostart, osize)); + } +} + +int ncchkioi_calc_chunk_overlap (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset **starts, + MPI_Offset **counts, + ncchkioi_chunk_overlap_t *ocnt) { + int err=NC_NOERR; + int i, j, k; + int cid; // Chunk iterator + int req; + MPI_Offset overlapsize; + MPI_Offset *ostart, *osize; + MPI_Offset *citr; // Bounding box for chunks overlapping my own write region + + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + CHK_PTR (ostart) + osize = ostart + varp->ndim; + citr = osize + varp->ndim; + + memset (ocnt, 0, sizeof (ncchkioi_chunk_overlap_t) * varp->nchunkrec); + + // Count overlapsize of each request + if (varp->isrec) { + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (cid < varp->nchunkrec) { // Count only first record + // Count overlap + overlapsize = 1; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ocnt[cid].osize += (double)overlapsize; + if (ocnt[cid].osize > varp->chunksize) { + ocnt[cid].osize = (double)varp->chunksize; + } + } + } while (ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize)); + } + } else { + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Count overlap + overlapsize = 1; + for (i = 
0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ocnt[cid].osize += overlapsize; + if (ocnt[cid].osize > varp->chunksize) { ocnt[cid].osize = varp->chunksize; } + } while (ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize)); + } + } + + // First 16 bit used as noise + for (i = 0; i < varp->nchunkrec; i++) { + ocnt[i].rank = ncchkp->rank; + ocnt[i].osize *= varp->esize; + ocnt[i].osize <<= 16; + } + + // Noise to break tie + j = (ncchkp->rank - ncchkp->assigned_chunks) % ncchkp->np; + if (j < 0) j += ncchkp->np; + if (j > varp->nchunkrec) { j = varp->nchunkrec; } + k = ncchkp->np - 1; // noise from 0 ~ np-1 + for (i = j; i < varp->nchunkrec; i++) { + ocnt[i].osize += k; + k--; + if (k < 0) { k += ncchkp->np; } + } + for (i = 0; i < j; i++) { + ocnt[i].osize += k; + k--; + if (k < 0) { k += ncchkp->np; } + } + ncchkp->assigned_chunks += varp->nchunk; + +err_out:; + NCI_Free (ostart); + return err; +} + +void ncchkioi_assign_chunk_owner (NC_chk *ncchkp, + NC_chk_var *varp, + ncchkioi_chunk_overlap_t *ocnt) { + int i, j; + for (i = 0; i < varp->nchunkrec; i++) { varp->chunk_owner[i] = ocnt[i].rank; } + if (varp->isrec) { + for (i = varp->nchunkrec; i < varp->nchunk; i += varp->nchunkrec) { + memcpy (varp->chunk_owner + i, varp->chunk_owner, sizeof (int) * varp->nchunkrec); + } + } + + // Build skip list of my own chunks + if (varp->nchunk > 0) { + varp->nmychunkrec = 0; + for (j = 0; j < varp->nchunkrec; j++) { + if (varp->chunk_owner[j] == ncchkp->rank) { varp->nmychunkrec++; } + } + varp->nmychunk = varp->nmychunkrec * varp->nrec; + varp->mychunks = (int *)NCI_Realloc (varp->mychunks, sizeof (int) * varp->nmychunkrec * varp->nrecalloc); + varp->nmychunk = 0; + for (j = 0; j < varp->nchunk; j++) { + if (varp->chunk_owner[j] == ncchkp->rank) { + varp->mychunks[varp->nmychunk++] = j; + if (varp->isnew) { // Only apply to new var, old var will be read when it is + // needed + // varp->chunk_cache[j] = 
(void*)NCI_Malloc(varp->chunksize); // Allocate + // buffer for blocks we own + // memset(varp->chunk_cache[j], 0 , varp->chunksize); + } + } + } + } else { + varp->nmychunk = varp->nmychunkrec = 0; + varp->mychunks = NULL; + } + + // Update global chunk count + ncchkp->nmychunks += (MPI_Offset) (varp->nmychunk); + ncchkp->cown_size += + (MPI_Offset) ((double)((MPI_Offset) (varp->nmychunk) * (MPI_Offset) (varp->chunksize)) * + ncchkp->cown_ratio); +} + +int ncchkioi_sync_ocnt_reduce (NC_chk *ncchkp, + int nchunk, + ncchkioi_chunk_overlap_t *ocnt, + ncchkioi_chunk_overlap_t *ocnt_all, + MPI_Request *req) { + int err=NC_NOERR; + int i; + + // Construct MPI type for overlap if not already constructed + if (ncchkp->overlaptype == MPI_DATATYPE_NULL) { + err = MPI_Type_contiguous (sizeof (ncchkioi_chunk_overlap_t), MPI_BYTE, + &(ncchkp->overlaptype)); + CHK_MPIERR + err = MPI_Type_commit (&(ncchkp->overlaptype)); + CHK_MPIERR + } + + if (ncchkp->max_cown_op == MPI_OP_NULL) { + err = MPI_Op_create (max_osize_rank_op, 1, &(ncchkp->max_cown_op)); + CHK_MPIERR + } + + // Apply owner penalty + for (i = 0; i < nchunk; i++) { + ocnt[i].osize -= ncchkp->cown_size << 16; // Penality for load ballance, set at 1/16 + } + + if (req) { + CHK_ERR_IALLREDUCE (ocnt, ocnt_all, nchunk, ncchkp->overlaptype, ncchkp->max_cown_op, + ncchkp->comm, req); + } else { + CHK_ERR_ALLREDUCE (ocnt, ocnt_all, nchunk, ncchkp->overlaptype, ncchkp->max_cown_op, + ncchkp->comm); + } + +err_out:; + return err; +} + +int ncchkioi_sync_ocnt_gather (NC_chk *ncchkp, + int nchunk, + ncchkioi_chunk_overlap_t *ocnt, + MPI_Offset **ocnt_all, + MPI_Request *req) { + int err=NC_NOERR; + + // Construct MPI type for overlap if not already constructed + if (ncchkp->overlaptype == MPI_DATATYPE_NULL) { + MPI_Datatype tmptype; + + err = MPI_Type_contiguous (sizeof (MPI_Offset), MPI_BYTE, &tmptype); + CHK_MPIERR + err = MPI_Type_commit (&tmptype); + CHK_MPIERR + err = MPI_Type_create_resized (tmptype, 0, sizeof 
(ncchkioi_chunk_overlap_t), + &(ncchkp->overlaptype)); + CHK_MPIERR + err = MPI_Type_commit (&(ncchkp->overlaptype)); + CHK_MPIERR + err = MPI_Type_free (&tmptype); + } + + if (req) { + err = MPI_Igather (ocnt, nchunk, ncchkp->overlaptype, ocnt_all[0], nchunk, MPI_LONG_LONG, 0, + ncchkp->comm, req); + } else { + err = MPI_Gather (ocnt, nchunk, ncchkp->overlaptype, ocnt_all[0], nchunk, MPI_LONG_LONG, 0, + ncchkp->comm); + } + CHK_MPIERR + +err_out:; + return err; +} + +int ncchkioi_sync_ocnt_gather_bcast (NC_chk *ncchkp, + NC_chk_var *varp, + MPI_Offset **ocnt_in, + ncchkioi_chunk_overlap_t *ocnt_all, + MPI_Request *req) { + int err=NC_NOERR; + int i, j, k; + MPI_Offset *cown_size; + + if (ncchkp->rank == 0) { + cown_size = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->np); + memset (cown_size, 0, sizeof (MPI_Offset) * ncchkp->np); + for (i = 0; i < varp->nchunkrec; i++) { + ocnt_all[i].rank = 0; + ocnt_all[i].osize = ocnt_in[0][i]; + k = 0; + for (j = 1; j < ncchkp->np; j++) { + if (ocnt_in[j][i] - cown_size[j] > ocnt_in[k][i] - cown_size[k]) { k = j; } + } + cown_size[k] += + (MPI_Offset) ((double)(varp->chunksize) * ncchkp->cown_ratio) * varp->nrec; + ocnt_all[i].rank = k; + ocnt_all[i].osize = ocnt_in[i][k]; + } + } + + if (req) { + err = MPI_Ibcast (ocnt_all, varp->nchunkrec, ncchkp->overlaptype, 0, ncchkp->comm, req); + } else { + err = MPI_Bcast (ocnt_all, varp->nchunkrec, ncchkp->overlaptype, 0, ncchkp->comm); + } + CHK_MPIERR + +err_out:; + return err; +} + +int ncchkioi_calc_chunk_owner_reduce ( + NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) { + int err=NC_NOERR; + ncchkioi_chunk_overlap_t *ocnt, *ocnt_all; + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_COWN) + + ocnt = (ncchkioi_chunk_overlap_t *)NCI_Malloc (sizeof (ncchkioi_chunk_overlap_t) * + varp->nchunkrec * 2); + CHK_PTR (ocnt) + ocnt_all = ocnt + varp->nchunkrec; + + err = ncchkioi_calc_chunk_overlap (ncchkp, varp, nreq, starts, counts, ocnt); + 
CHK_ERR + + if (ncchkp->exact_cown) { + // err = ncchkioi_sync_ocnt_gather (ncchkp, varp->nchunkrec, ocnt, ocnt_all, NULL); + // CHK_ERR + RET_ERR (NC_ENOTSUPPORT) + } else { + err = ncchkioi_sync_ocnt_reduce (ncchkp, varp->nchunkrec, ocnt, ocnt_all, NULL); + CHK_ERR + } + + ncchkioi_assign_chunk_owner (ncchkp, varp, ocnt_all); + + ncchkioi_write_chunk_ocnt (ncchkp, varp, ocnt, sizeof (ncchkioi_chunk_overlap_t)); + + NCI_Free (ocnt); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_COWN) + +err_out:; + return err; +} + +static inline int ncchkioi_reduce_max_csize_n ( + NC_chk *ncchkp, int nvar, NC_chk_var **varps, MPI_Offset **ocnts, int **cowns) { + int err=NC_NOERR; + int i, j, k, v; + int nchunk; + MPI_Offset **ocnts_all[2]; + MPI_Offset *cown_size; + MPI_Offset *ocnt, **ocnt_all; + int *cown; + NC_chk_var *varp; + MPI_Request req; + MPI_Request *bcast_reqs; + MPI_Status stat; + + bcast_reqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nvar); + CHK_PTR (bcast_reqs) + + if (ncchkp->rank == 0) { + // Size owned by each process + cown_size = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->np); + CHK_PTR (cown_size) + memset (cown_size, 0, sizeof (MPI_Offset) * ncchkp->np); + + // Max #chunks across vars + nchunk = 0; + for (v = 0; v < nvar; v++) { + varp = varps[v]; + if (varp->nchunkrec > nchunk) { nchunk = varp->nchunkrec; } + } + // Allocate 2 set of ocnts_all + ocnts_all[0] = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * ncchkp->np * 2); + CHK_PTR (ocnts_all[0]) + ocnts_all[1] = ocnts_all[0] + ncchkp->np; + + ocnts_all[0][0] = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * nchunk * ncchkp->np * 2); + CHK_PTR (ocnts_all[0][0]) + ocnts_all[1][0] = ocnts_all[0][0] + nchunk * ncchkp->np; + for (i = 1; i < ncchkp->np; i++) { + ocnts_all[0][i] = ocnts_all[0][i - 1] + nchunk; + ocnts_all[1][i] = ocnts_all[1][i - 1] + nchunk; + } + + if (nvar > 0) { + varp = varps[0]; + ocnt = ocnts[0]; + ocnt_all = ocnts_all[0]; + err = MPI_Igather (ocnt, 
varp->nchunkrec, MPI_LONG_LONG, ocnt_all, varp->nchunkrec, + MPI_LONG_LONG, 0, ncchkp->comm, &req); + CHK_ERR + } + + for (v = 0; v < nvar; v++) { + cown = cowns[v]; + varp = varps[v]; + ocnt = ocnts[v]; + ocnt_all = ocnts_all[v & 1]; + + // Wait for comm + err = MPI_Wait (&req, &stat); + CHK_ERR + + // Post comm for next var + if (v < nvar - 1) { + err = MPI_Igather (ocnts[v + 1], varps[v + 1]->nchunkrec, MPI_LONG_LONG, + ocnts_all[(v + 1) & 1], varps[v + 1]->nchunkrec, MPI_LONG_LONG, + 0, ncchkp->comm, &req); + CHK_ERR + } + + // Compute max rank for this var + memset (cown, 0, sizeof (int) * varp->nchunkrec); + for (i = 0; i < varp->nchunk; i++) { + k = 0; + for (j = 1; j < ncchkp->np; j++) { + if (ocnt_all[j][i] - cown_size[j] > ocnt_all[k][i] - cown_size[k]) { k = j; } + } + cown_size[k] += + (MPI_Offset) ((double)(varp->chunksize) * ncchkp->cown_ratio) * varp->nrec; + cown[i] = k; + } + + // Bcast result + err = MPI_Ibcast (cown, varp->nchunkrec, MPI_INT, 0, ncchkp->comm, bcast_reqs + v); + CHK_ERR + } + } else { + for (v = 0; v < nvar; v++) { + // Send to rank 0 + err = MPI_Gather (ocnts[v], varps[v]->nchunkrec, MPI_LONG_LONG, NULL, + varps[v]->nchunkrec, MPI_LONG_LONG, 0, ncchkp->comm); + CHK_ERR + // Recv result + err = MPI_Ibcast (cowns[v], varps[v]->nchunkrec, MPI_INT, 0, ncchkp->comm, bcast_reqs + v); + CHK_ERR + } + } + + err = MPI_Waitall (nvar, bcast_reqs, MPI_STATUS_IGNORE); + CHK_ERR + + if (ncchkp->rank == 0) { + NCI_Free (cown_size); + NCI_Free (ocnts_all[0][0]); + NCI_Free (ocnts_all[0]); + } + NCI_Free (bcast_reqs); + +err_out:; + return err; +} + +int ncchkioi_calc_chunk_owner_gather ( + NC_chk *ncchkp, int nvar, NC_chk_var **varps, int nput, int *putreqs, int nget, int *getreqs) { + int err=NC_NOERR; + int i, j; + int nchunks; + MPI_Offset *ostart, *osize; + MPI_Offset *citr; // Bounding box for chunks overlapping my own write region + MPI_Offset **ocnts; + int **cowns; + NC_chk_var *varp; + int *idmap; + NC_chk_req *reqp; + + 
NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_COWN) + + // Allocate buffer for overlappinp structure + // Box of single overlap + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->max_ndim * 3); + osize = ostart + ncchkp->max_ndim; + citr = osize + ncchkp->max_ndim; + // Calculate total number of chunks to assign + idmap = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt); + nchunks = 0; + for (i = 0; i < nvar; i++) { + idmap[varps[i]->varid] = i; + nchunks += varps[i]->nchunkrec; + } + // Overlap count struct + ocnts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * nvar); + ocnts[0] = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * nchunks); + cowns = (int **)NCI_Malloc (sizeof (int *) * nvar); + cowns[0] = varps[0]->chunk_owner; + for (i = 1; i < nvar; i++) { + ocnts[i] = ocnts[i - 1] + varps[i - 1]->nchunkrec; + cowns[i] = varps[i]->chunk_owner; + } + + // Count overlapsize for each request + memset (ocnts[0], 0, sizeof (ncchkioi_chunk_overlap_t) * nchunks); + for (i = 0; i < nput; i++) { + reqp = ncchkp->putlist.reqs + putreqs[i]; + ncchkioi_rec_chunk_overlap (ostart, osize, citr, ncchkp->vars.data + reqp->varid, + ocnts[idmap[reqp->varid]], reqp); + } + for (i = 0; i < nget; i++) { + reqp = ncchkp->getlist.reqs + getreqs[i]; + ncchkioi_rec_chunk_overlap (ostart, osize, citr, ncchkp->vars.data + reqp->varid, + ocnts[idmap[reqp->varid]], reqp); + } + + // Calculate the max rank + ncchkioi_reduce_max_csize_n (ncchkp, nvar, varps, ocnts, cowns); + + // Copy owner to other records + for (i = 0; i < nvar; i++) { + varp = varps[i]; + if (varp->isrec) { + for (j = varp->nchunkrec; j < varp->nchunk; j += varp->nchunkrec) { + memcpy (varp->chunk_owner + j, varp->chunk_owner, sizeof (int) * varp->nchunkrec); + } + } + ncchkioi_write_chunk_ocnt (ncchkp, varp, ocnts[i], sizeof (MPI_Offset)); + } + + NCI_Free (ostart); + NCI_Free (ocnts[0]); + NCI_Free (ocnts); + NCI_Free (cowns); + NCI_Free (idmap); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_COWN) + + 
return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_chunk_size.c b/src/drivers/ncchunkio/ncchkioi_chunk_size.c
new file mode 100644
index 0000000000..03b610f5de
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_chunk_size.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ncchkio_internal.h"
+
+/*
+ * Greatest common divisor of a and b, computed with Euclid's algorithm.
+ * gcd (x, 0) and gcd (0, x) both return x.
+ */
+MPI_Offset gcd (MPI_Offset a, MPI_Offset b) {
+    if (b) {
+        while ((a %= b) && (b %= a))
+            ;
+    }
+    return a + b;  // One operand is 0 by now, the other one holds the gcd
+}
+
+/*
+ * Reduction function handed to MPI_Op_create() in ncchkioi_calc_chunk_size().
+ * Stores gcd (in[i], inout[i]) in inout[i] for every i in [0, *len); when one
+ * operand is 0 the other one is kept. The in vector is only read and dptr is
+ * ignored.
+ * NOTE(review): the elements are treated as long long while the caller passes
+ * MPI_Offset data as MPI_LONG_LONG - assumes both types match, confirm.
+ */
+void gcd_reduce (long long *in, long long *inout, int *len, MPI_Datatype *dptr) {
+    int i;
+
+    (void)dptr;
+
+    for (i = 0; i < *len; i++) {
+        long long a = in[i];
+        long long b = inout[i];
+
+        if (b) {
+            while ((a %= b) && (b %= a))
+                ;
+        }
+        inout[i] = a + b;
+    }
+}
+
+/*
+ * qsort() comparator that puts larger MPI_Offset values first.
+ * The operands are compared instead of subtracted: the difference of two
+ * MPI_Offset values need not fit the int return value.
+ */
+int smaller (const void *a, const void *b) {
+    MPI_Offset x = *(const MPI_Offset *)a;
+    MPI_Offset y = *(const MPI_Offset *)b;
+
+    return (x < y) - (x > y);
+}
+
+/*
+ * Choose the chunk dimensions of a variable and store them in varp->chunkdim.
+ *
+ * Each process takes gcd (start, count) of its requests as per-dimension
+ * candidates, the candidates are combined by a gcd all-reduce over
+ * ncchkp->comm, and the result is split or grown until the chunk size fits the
+ * bounds computed below.
+ *
+ * ncchkp  file object; ncchkp->comm is used for the all-reduce
+ * varp    variable; ndim, dimsize and isrec are read, chunkdim is written
+ * nreq    number of local requests, 0 when this process has none
+ * starts  starts[r][i] is the start of request r along dimension i
+ * counts  counts[r][i] is the count of request r along dimension i
+ *
+ * Returns err, which is NC_NOERR unless CHK_ERR_ALLREDUCE (defined elsewhere)
+ * changes it.
+ */
+int ncchkioi_calc_chunk_size (
+    NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) {
+    int err=NC_NOERR;
+    int r, i, j;
+    int primes[] = {2,  3,  5,  7,  11, 13, 17, 19, 23, 29, 31, 37, 41,
+                    43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97};
+    const int nprimes = (int)(sizeof (primes) / sizeof (primes[0]));
+    MPI_Offset *chunkdim = NULL;
+    MPI_Offset **candidates = NULL;
+    MPI_Offset chunksize;
+    MPI_Offset ub, lb;
+    MPI_Op gcd_op = MPI_OP_NULL;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_CSIZE)
+
+    // Upper and lower bound of reasonable chunk size
+    ub = (MPI_Offset)INT_MAX;  // Max chunk size supported
+    lb = 1;
+    for (i = 0; i < varp->ndim; i++) { lb *= varp->dimsize[i]; }
+    lb /= (MPI_Offset)INT_MAX;  // Max # chunks supported
+    if (lb < varp->ndim * 3) {  // Metadata should not exceed data
+        lb = varp->ndim * 3;
+    }
+    if (lb < 1024) {  // At least 1 KiB for efficiency
+        lb = 1024;
+    }
+
+    /* Infer chunk size by reqs
+     * Assume the application is doing blocked division
+     * If we set chunk dim to gcd of all access boundary, no communication required
+     * If the pattern is completely randomized, the result will likely be 1
+     */
+    chunkdim = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim);
+    if (nreq > 0 && varp->ndim > 0) {  // candidates[0] needs at least one dimension
+        candidates = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * varp->ndim);
+        candidates[0] = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * nreq);
+        for (i = 1; i < varp->ndim; i++) { candidates[i] = candidates[i - 1] + nreq; }
+        for (r = 0; r < nreq; r++) {
+            for (i = 0; i < varp->ndim; i++) {
+                candidates[i][r] = gcd (starts[r][i], counts[r][i]);
+            }
+        }
+        for (i = 0; i < varp->ndim; i++) {
+            qsort (candidates[i], nreq, sizeof (MPI_Offset), smaller);
+            chunkdim[i] = candidates[i][0];
+            for (r = 1; r < nreq / 2; r++) {  // Take the top 50% to drop out fragment writes
+                chunkdim[i] = gcd (chunkdim[i], candidates[i][r]);
+            }
+        }
+    } else {
+        for (i = 0; i < varp->ndim; i++) {
+            chunkdim[i] = 0;  // We have no clue, listen to other processes
+        }
+    }
+
+    // Global gcd
+    MPI_Op_create ((MPI_User_function *)gcd_reduce, 1, &gcd_op);
+    CHK_ERR_ALLREDUCE (MPI_IN_PLACE, chunkdim, varp->ndim, MPI_LONG_LONG, gcd_op, ncchkp->comm);
+    MPI_Op_free (&gcd_op);
+
+    // If we have no clue across processes, set chunk to max
+    for (i = 0; i < varp->ndim; i++) {
+        if (chunkdim[i] == 0) { chunkdim[i] = varp->dimsize[i]; }
+    }
+
+    // At least 1 for rec dim
+    if (varp->isrec) {
+        if (chunkdim[0] == 0) { chunkdim[0] = 1; }
+    }
+
+    // Check if chunk size is reasonable (not too large or too small)
+    chunksize = 1;
+    for (i = 0; i < varp->ndim; i++) { chunksize *= chunkdim[i]; }
+
+    // we only support chunk size up to INT_MAX
+    if (chunksize > ub) {
+        // Can we find a perfect split using small prime numbers?
+        j = 0;
+        while ((j < nprimes) && (chunksize > ub)) {
+            r = 1;
+            for (i = 0; i < varp->ndim; i++) {  // Splitting chunks along dims
+                if (chunkdim[i] % primes[j] == 0) {
+                    chunkdim[i] /= primes[j];
+                    chunksize /= primes[j];
+                    r = 0;
+                }
+            }
+            if (r) {  // No fit, try next prime
+                j++;
+            }
+        }
+        if (j >= nprimes) {  // If not, we still need to split even if that introduces
+                             // communication overhead
+            for (i = 0; chunksize > ub; i++) {  // Halve the dims in round-robin order
+                MPI_Offset *dim = chunkdim + (i % varp->ndim);
+
+                if (*dim > 1) {  // Halving a dim of 1 would produce a chunk dim of 0
+                    chunksize /= *dim;  // Keep chunksize the exact product of chunkdim
+                    *dim /= 2;
+                    chunksize *= *dim;
+                }
+            }
+        }
+    } else if (chunksize < lb) {  // Data smaller than metadata
+        int tmp;
+        int *heap;
+        int hsize;
+
+        // Build heap of smallest chunk dim
+        heap = (int *)NCI_Malloc (sizeof (int) * varp->ndim);
+        for (i = 0; i < varp->ndim; i++) {
+            heap[i] = i;
+            j = i;
+            r = (j - 1) / 2;
+            while (j > 0 && chunkdim[heap[j]] < chunkdim[heap[r]]) {
+                tmp = heap[j];
+                heap[j] = heap[r];
+                heap[r] = tmp;
+                j = r;
+                r = (j - 1) / 2;
+            }
+        }
+
+        hsize = varp->ndim;
+        while (chunksize < lb && hsize > 0) {
+            j = heap[0];
+            if (chunkdim[j] * 2 <= varp->dimsize[j]) {  // Merge chunk along smallest dim
+                chunkdim[j] *= 2;
+                chunksize *= 2;
+            } else {  // Already reach var dim, remove from consideration
+                heap[0] = heap[--hsize];
+            }
+            // Heapify
+            r = 0;
+            i = r * 2 + 1;
+            j = r * 2 + 2;
+            while (i < hsize) {
+                if ((j >= hsize) || (chunkdim[heap[i]] < chunkdim[heap[j]])) {
+                    if (chunkdim[heap[i]] < chunkdim[heap[r]]) {
+                        tmp = heap[r];
+                        heap[r] = heap[i];
+                        heap[i] = tmp;
+                        r = i;
+                    } else {
+                        break;
+                    }
+                } else {
+                    if (chunkdim[heap[j]] < chunkdim[heap[r]]) {
+                        tmp = heap[r];
+                        heap[r] = heap[j];
+                        heap[j] = tmp;
+                        r = j;
+                    } else {
+                        break;
+                    }
+                }
+                i = r * 2 + 1;
+                j = r * 2 + 2;
+            }
+        }
+        NCI_Free (heap);
+
+        // Still not enough after doing everything, just set to entire var
+        if (chunksize < lb) {
+            memcpy (chunkdim, varp->dimsize, sizeof (MPI_Offset) * varp->ndim);
+
+            // At least 1 for rec dim
+            if (varp->isrec) {
+                if (chunkdim[0] == 0) { chunkdim[0] = 1; }
+            }
+        }
+    }
+
+    for (i = 0; i < varp->ndim; i++) { varp->chunkdim[i] = (int)chunkdim[i]; }
+
+err_out:;
+
+    // MPI_Op_free() resets the handle, so it is non-null here only when the free above
+    // was skipped. NOTE(review): assumes CHK_ERR_ALLREDUCE jumps to err_out on failure
+    if (gcd_op != MPI_OP_NULL) { MPI_Op_free (&gcd_op); }
+    NCI_Free (chunkdim);
+    if (candidates != NULL) {
+        NCI_Free (candidates[0]);
+        NCI_Free (candidates);
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_CSIZE)
+
+    return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_convert.c b/src/drivers/ncchunkio/ncchkioi_convert.c
new file mode 100644
index 0000000000..d7bb304bbb
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_convert.c
@@ -0,0 +1,939 @@
+/* Do not edit this file. It is produced from the corresponding .m4 source */
+/*
+ * Copyright (C) 2018, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+int ncchkioiconvert(void *inbuf, void *outbuf, MPI_Datatype intype, MPI_Datatype outtype, int N) {
+    int i;
+
+    if (intype == MPI_BYTE){
+
+        if (outtype == MPI_BYTE) {
+            for(i = 0; i < N; i++){
+                ((char*)outbuf)[i] = (char)(((char*)inbuf)[i]);
+            }
+            return NC_NOERR;
+        }
+        if (outtype == MPI_CHAR) {
+            for(i = 0; i < N; i++){
+                ((char*)outbuf)[i] = (char)(((char*)inbuf)[i]);
+            }
+            return NC_NOERR;
+        }
+        if (outtype == MPI_SIGNED_CHAR) {
+            for(i = 0; i < N; i++){
+                ((signed char*)outbuf)[i] = (signed char)(((char*)inbuf)[i]);
+            }
+            return NC_NOERR;
+        }
+        if (outtype == MPI_UNSIGNED_CHAR) {
+            for(i = 0; i < N; i++){
+                ((unsigned char*)outbuf)[i] = (unsigned char)(((char*)inbuf)[i]);
+            }
+            return NC_NOERR;
+        }
+        if (outtype == MPI_SHORT) {
+            for(i = 0; i < N; i++){
+                ((short*)outbuf)[i] = (short)(((char*)inbuf)[i]);
+            }
+            return NC_NOERR;
+        }
+        if (outtype == MPI_UNSIGNED_SHORT) {
+            for(i = 0; i < N; i++){
+                ((unsigned short*)outbuf)[i] = (unsigned short)(((char*)inbuf)[i]);
+            }
+            return NC_NOERR;
+        }
+        if (outtype == MPI_INT) {
+            for(i = 0; i < N; i++){
+                ((int*)outbuf)[i] = (int)(((char*)inbuf)[i]);
+            }
+            return NC_NOERR;
+        }
+        if (outtype == MPI_UNSIGNED) {
+            for(i = 0;
i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_CHAR){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + 
((float*)outbuf)[i] = (float)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((char*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_SIGNED_CHAR){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { 
+ for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((signed char*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_UNSIGNED_CHAR){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = 
(double)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((unsigned char*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_SHORT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long 
long*)outbuf)[i] = (long long)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((short*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_UNSIGNED_SHORT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((unsigned short*)inbuf)[i]); + } + 
return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((unsigned short*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_INT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((int*)inbuf)[i]); + } + return 
NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_UNSIGNED){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((unsigned int*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == 
MPI_FLOAT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((float*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_DOUBLE){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == 
MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((double*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_LONG_LONG_INT){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == 
MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed char)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((long long*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + if (intype == MPI_UNSIGNED_LONG_LONG){ + + if (outtype == MPI_BYTE) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_CHAR) { + for(i = 0; i < N; i++){ + ((char*)outbuf)[i] = (char)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((signed char*)outbuf)[i] = (signed 
char)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_CHAR) { + for(i = 0; i < N; i++){ + ((unsigned char*)outbuf)[i] = (unsigned char)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_SHORT) { + for(i = 0; i < N; i++){ + ((short*)outbuf)[i] = (short)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_SHORT) { + for(i = 0; i < N; i++){ + ((unsigned short*)outbuf)[i] = (unsigned short)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_INT) { + for(i = 0; i < N; i++){ + ((int*)outbuf)[i] = (int)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED) { + for(i = 0; i < N; i++){ + ((unsigned int*)outbuf)[i] = (unsigned int)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_FLOAT) { + for(i = 0; i < N; i++){ + ((float*)outbuf)[i] = (float)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_DOUBLE) { + for(i = 0; i < N; i++){ + ((double*)outbuf)[i] = (double)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_LONG_LONG_INT) { + for(i = 0; i < N; i++){ + ((long long*)outbuf)[i] = (long long)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + if (outtype == MPI_UNSIGNED_LONG_LONG) { + for(i = 0; i < N; i++){ + ((unsigned long long*)outbuf)[i] = (unsigned long long)(((unsigned long long*)inbuf)[i]); + } + return NC_NOERR; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + } + DEBUG_RETURN_ERROR(NC_EBADTYPE);; + + return NC_NOERR; +} \ No newline at end of file diff --git a/src/drivers/ncchunkio/ncchkioi_convert.m4 b/src/drivers/ncchunkio/ncchkioi_convert.m4 new file mode 100644 index 0000000000..8f2fd9badf --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_convert.m4 @@ -0,0 +1,79 @@ +dnl Process this m4 file to produce 'C' language file. 
+dnl
+dnl If you see this line, you can ignore the next one.
+/* Do not edit this file. It is produced from the corresponding .m4 source */
+dnl
+/*
+ * Copyright (C) 2018, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+dnl
+include(`foreach.m4')dnl
+include(`utils.m4')dnl
+dnl upcase(text): translates a-z in text to A-Z. Not referenced again in this file.
+define(`upcase', `translit(`$*', `a-z', `A-Z')')dnl
+dnl SWOUT(mpi_out, c_out, c_in): branch for outtype equal to mpi_out; casts N elements from c_in to c_out and returns NC_NOERR.
+define(`SWOUT',dnl
+`dnl
+    if (outtype == $1) {
+        for(i = 0; i < N; i++){
+            (($2*)outbuf)[i] = ($2)((($3*)inbuf)[i]);
+        }
+        return NC_NOERR;
+    }
+')dnl
+dnl SWIN(mpi_in, c_in): block for intype equal to mpi_in; one SWOUT branch per output type, then DEBUG_RETURN_ERROR(NC_EBADTYPE) if none matched.
+define(`SWIN',dnl
+`dnl
+    if (intype == $1){
+
+foreach(`dt', (`(`MPI_BYTE', `char')', dnl
+               `(`MPI_CHAR', `char')', dnl
+               `(`MPI_SIGNED_CHAR', `signed char')', dnl
+               `(`MPI_UNSIGNED_CHAR', `unsigned char')', dnl
+               `(`MPI_SHORT', `short')', dnl
+               `(`MPI_UNSIGNED_SHORT', `unsigned short')', dnl
+               `(`MPI_INT', `int')', dnl
+               `(`MPI_UNSIGNED', `unsigned int')', dnl
+               `(`MPI_FLOAT', `float')', dnl
+               `(`MPI_DOUBLE', `double')', dnl
+               `(`MPI_LONG_LONG_INT', `long long')', dnl
+               `(`MPI_UNSIGNED_LONG_LONG', `unsigned long long')', dnl
+               ), `SWOUT(translit(dt, `()'), $2)')dnl
+        DEBUG_RETURN_ERROR(NC_EBADTYPE);;
+    }
+')dnl
+dnl Text of the generated C file starts here; the foreach in the function emits one SWIN block per input type. NOTE(review): the last return follows DEBUG_RETURN_ERROR and looks unreachable - confirm.
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+int ncchkioiconvert(void *inbuf, void *outbuf, MPI_Datatype intype, MPI_Datatype outtype, int N) {
+    int i;
+
+foreach(`dt', (`(`MPI_BYTE', `char')', dnl
+               `(`MPI_CHAR', `char')', dnl
+               `(`MPI_SIGNED_CHAR', `signed char')', dnl
+               `(`MPI_UNSIGNED_CHAR', `unsigned char')', dnl
+               `(`MPI_SHORT', `short')', dnl
+               `(`MPI_UNSIGNED_SHORT', `unsigned short')', dnl
+               `(`MPI_INT', `int')', dnl
+               `(`MPI_UNSIGNED', `unsigned int')', dnl
+               `(`MPI_FLOAT', `float')', dnl
+               `(`MPI_DOUBLE', `double')', dnl
+               `(`MPI_LONG_LONG_INT', `long long')', dnl
+               `(`MPI_UNSIGNED_LONG_LONG', `unsigned long long')', dnl
+               ), `SWIN(translit(dt, `()'))')dnl
+    DEBUG_RETURN_ERROR(NC_EBADTYPE);;
+
+    return NC_NOERR;
+}
\
No newline at end of file diff --git a/src/drivers/ncchunkio/ncchkioi_get_var.c b/src/drivers/ncchunkio/ncchkioi_get_var.c new file mode 100644 index 0000000000..2b4db288b1 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_get_var.c @@ -0,0 +1,878 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +int ncchkioi_get_var_cb_chunk (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) { + int err=NC_NOERR; + int i, j, k; + int cid; // Chunk iterator + + MPI_Offset *ostart = NULL, *osize; + int *tsize = NULL, *tssize, *tstart, *tsizep, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Chunk iterator + + int *rcnt_local = NULL, *rcnt_all = NULL; // Number of processes that writes to each chunk + + int overlapsize; // Size of overlaping region of request and chunk + char *cbuf = NULL; // Intermediate continuous buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nread; // # chunks to read form file + int *rids = NULL; // Id of chunks to read from file + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreqs = NULL, *rreqs = NULL; // Send and recv req + MPI_Status *sstats = NULL, *rstats = NULL; // Send and recv status + char **sbufs = NULL, **rbufs = NULL; // Send and recv buffer + int *rsizes = NULL; // recv size of each message + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) 
+ NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT) + + // Allocate buffering for write count + rcnt_local = (int *)NCI_Malloc (sizeof (int) * varp->nchunk * 2); + rcnt_all = rcnt_local + varp->nchunk; + + // Allocate buffering for overlaping index + tsize = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + tssize = tsize + varp->ndim; + tstart = tssize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (rcnt_local, 0, sizeof (int) * varp->nchunk); + nsend = 0; + + // Iterate through chunks + ncchkioi_chunk_itr_init (varp, start, count, citr, &cid); + do { + rcnt_local[cid] = 1; + + if (varp->chunk_owner[cid] != ncchkp->rank) { + // Count number of mnessage we need to send + nsend++; + } + } while (ncchkioi_chunk_itr_next (varp, start, count, citr, &cid)); + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, varp->nchunk, MPI_INT, MPI_SUM, ncchkp->comm); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // We need to prepare chunk in the chunk cache + // For chunks not yet allocated, we need to read them form file collectively + // We collect chunk id of those chunks + // Calculate number of recv request + // This is for all the chunks + rids = (int *)NCI_Malloc (sizeof (int) * varp->nmychunk); + nread = 0; + nrecv = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + // We don't need message for our own data + nrecv += rcnt_all[cid] - rcnt_local[cid]; + // Count number of chunks we need to 
prepare + // We read only chunks that is required + + if (rcnt_all[cid] || rcnt_local[cid]) { + if (varp->chunk_cache[cid] == NULL) { + // err = ncchkioi_cache_alloc(ncchkp, varp->chunksize, varp->chunk_cache + cid); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + // ncchkioi_cache_visit(ncchkp, varp->chunk_cache[cid]); + } + } + } + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB) // I/O time count separately + +#ifdef PNETCDF_PROFILING + MPI_Barrier (ncchkp->comm); +#endif + // Decompress chunks into chunk cache + err = ncchkioi_load_var (ncchkp, varp, nread, rids); + CHK_ERR + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + + // Allocate buffer for send and recv + // We need to accept nrecv requests and receive nsend of replies + rreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nrecv + nsend)); + CHK_PTR (rreqs) + rstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nrecv + nsend)); + CHK_PTR (rstats) + rbufs = (char **)NCI_Malloc (sizeof (char *) * (nrecv + nsend)); + CHK_PTR (rbufs) + rsizes = (int *)NCI_Malloc (sizeof (int) * (nrecv + nsend)); + CHK_PTR (rsizes) + // We need to send nsend requests and reply nrecv of requests + sbufs = (char **)NCI_Malloc (sizeof (char *) * (nrecv + nsend)); + CHK_PTR (sbufs) + sreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nrecv + nsend)); + CHK_PTR (sreqs) + sstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nrecv + nsend)); + CHK_PTR (sstats) + + // Post send + k = 0; + // Initialize chunk iterator + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, osize); + // Iterate through chunks + do { + // We got something to send if we are not owner + if (varp->chunk_owner[cid] != ncchkp->rank) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ) + + // Calculate chunk overlap + overlapsize = varp->esize; + for (j = 0; j < varp->ndim; j++) 
{ overlapsize *= osize[j]; } + + // Allocate buffer + sbufs[k] = (char *)NCI_Malloc (sizeof (int) * varp->ndim * 2); // For request + CHK_PTR (sbufs[k]) + rbufs[k + nrecv] = + (char *)NCI_Malloc (overlapsize); // For reply, first nrecv are for request + CHK_PTR (rbufs[k + nrecv]) + + // Metadata + tstartp = (int *)sbufs[k]; + packoff = varp->ndim * sizeof (int); + tsizep = (int *)(sbufs[k] + packoff); + packoff += varp->ndim * sizeof (int); + for (j = 0; j < varp->ndim; j++) { + tstartp[j] = (int)(ostart[j] - citr[j]); + tsizep[j] = (int)osize[j]; + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Send request + CHK_ERR_ISEND (sbufs[k], packoff, MPI_BYTE, varp->chunk_owner[cid], cid, ncchkp->comm, + sreqs + k); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Post recv reply + CHK_ERR_IRECV (rbufs[k + nrecv], overlapsize, MPI_BYTE, varp->chunk_owner[cid], + cid + 1024, ncchkp->comm, rreqs + nrecv + k); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + + k++; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Post recv + k = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + // We are the owner of the chunk + // Receive data from other process + for (j = 0; j < rcnt_all[cid] - rcnt_local[cid]; j++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, cid, ncchkp->comm, &rmsg, rstats); + CHK_ERR_GET_COUNT (rstats, MPI_BYTE, rsizes + k); + + // Allocate buffer + rbufs[k] = (char *)NCI_Malloc (rsizes[k]); + CHK_PTR (rbufs[k]) + + // Post irecv + CHK_ERR_IMRECV (rbufs[k], rsizes[k], MPI_BYTE, &rmsg, rreqs + k); + k++; + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Allocate intermediate buffer + cbuf = (char *)NCI_Malloc (varp->chunksize); + CHK_PTR (cbuf) + + // For each 
chunk we own, we need to receive incoming data + k = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + // Handle our own data first if we have any + if (rcnt_local[cid] > 0) { + // Convert chunk id to iterator + get_chunk_itr (varp, cid, citr); + + // Calculate overlapping region + get_chunk_overlap (varp, citr, start, count, ostart, osize); + + // Pack type from chunk buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, cbuf, varp->chunksize, &packoff, + ncchkp->comm); + overlapsize = packoff; + MPI_Type_free (&ptype); + + // Pack type from (contiguous) intermediate buffer to user buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into user buffer + packoff = 0; + CHK_ERR_UNPACK (cbuf, overlapsize, &packoff, buf, 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Wait for all send requests related to this chunk + // We remove the impact of -1 mark in rcnt_local[cid] + // printf("Rank: %d, CHK_ERR_WAITALL_recv(%d, %d)\n", ncchkp->rank, rcnt_all[cid] - + // rcnt_local[cid], k); fflush(stdout); + CHK_ERR_WAITALL (rcnt_all[cid] - rcnt_local[cid], rreqs + k, rstats + k); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Now, it is time to process data from other processes + for (j = 
0; j < varp->ndim; j++) { tsize[j] = varp->chunkdim[j]; } + + // Process data received + // printf("nrecv = %d, rcnt_all = %d, rcnt_local = %d\n", nrecv, rcnt_all[cid], + // rcnt_local[cid]); fflush(stdout); + for (j = k; j < k + rcnt_all[cid] - rcnt_local[cid]; j++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Metadata + tstartp = (int *)rbufs[j]; + packoff = varp->ndim * sizeof (int); + tssizep = (int *)(rbufs[j] + packoff); + packoff += varp->ndim * sizeof (int); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Allocate buffer + MPI_Type_size (ptype, &overlapsize); + sbufs[j + nsend] = (char *)NCI_Malloc (overlapsize); // For reply + + // Data + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbufs[j + nsend], overlapsize, + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Send reply + CHK_ERR_ISEND (sbufs[j + nsend], packoff, MPI_BYTE, rstats[j].MPI_SOURCE, cid + 1024, + ncchkp->comm, sreqs + j + nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + } + k += rcnt_all[cid] - rcnt_local[cid]; + + // princbuf(ncchkp->rank, varp->chunk_cache[cid], varp->chunksize); + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all send request + // printf("Rank: %d, CHK_ERR_WAITALL_send(%d, %d)\n", ncchkp->rank, nsend, 0); fflush(stdout); + CHK_ERR_WAITALL (nsend, sreqs, sstats); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Receive replies from the owners and update the user buffer + k = 0; + // Initialize chunk iterator + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, osize); + // Iterate through chunks + do { + // We got something to recv if 
we are not owner + if (varp->chunk_owner[cid] != ncchkp->rank) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + // Pack type from recv buffer to user buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + tssize[j] = (int)osize[j]; + } + // printf("Rank: %d, ostart=[%lld, %lld], osize=[%lld, %lld]\n", ncchkp->rank, + // ostart[0], ostart[1], osize[0], osize[1]); fflush(stdout); printf("Rank: %d, + // CHK_ERR_TYPE_CREATE_SUBARRAY4([%d, %d], [%d, %d], [%d, %d]\n", ncchkp->rank, + // tsize[0], tsize[1], tssize[0], tssize[1], tstart[0], tstart[1]); fflush(stdout); + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + // printf("Rank: %d, commit\n", ncchkp->rank); fflush(stdout); + CHK_ERR_TYPE_COMMIT (&ptype); + MPI_Type_size (ptype, &overlapsize); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // printf("Rank: %d, wait recv, nrecv = %d, k = %d, nsend = %d\n", ncchkp->rank, nrecv, + // k, nsend); fflush(stdout); + // Wait for reply + // printf("Rank: %d, MPI_Wait_recv(%d)\n", ncchkp->rank, nrecv + k); fflush(stdout); + MPI_Wait (rreqs + nrecv + k, rstats + nrecv + k); + + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_CB_RECV_REP, NC_CHK_TIMER_GET_CB_UNPACK_REP) + + // Pack data + // printf("Rank: %d, pack\n", ncchkp->rank); fflush(stdout); + packoff = 0; + CHK_ERR_UNPACK (rbufs[nrecv + k], overlapsize, &packoff, buf, 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + k++; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // printf("Rank: %d, wait_final\n", ncchkp->rank); fflush(stdout); + // Wait for all send replies + // printf("Rank: %d, CHK_ERR_WAITALL_send(%d, %d)\n", ncchkp->rank, nrecv, nsend); + // fflush(stdout); + CHK_ERR_WAITALL (nrecv, sreqs + nsend, sstats + nsend); + + 
NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + + // printf("Rank: %d, exiting\n", ncchkp->rank); fflush(stdout); + +err_out:; + + // Free buffers + NCI_Free (rcnt_local); + + NCI_Free (rids); + + NCI_Free (tsize); + + NCI_Free (ostart); + + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbufs[i]); + NCI_Free (rbufs[i]); + } + NCI_Free (sreqs); + NCI_Free (sstats); + NCI_Free (sbufs); + NCI_Free (rreqs); + NCI_Free (rstats); + NCI_Free (rbufs); + NCI_Free (rsizes); + + if (cbuf != NULL) { NCI_Free (cbuf); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + + return err; +} + +int ncchkioi_get_var_cb_proc (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) { + int err=NC_NOERR; + int i, j, k; + int cid, cown; // Chunk iterator + + MPI_Offset *ostart= NULL , *osize; + int *tsize = NULL, *tssize, *tstart, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Chunk iterator + + int *rcnt_local = NULL, *rcnt_all; // Number of processes that writes to each proc + + int rrange_local[2], rrange_all[2]; // Number of processes that writes to each chunk + + int overlapsize; // Size of overlaping region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nread; // # chunks to read form file + int *rids = NULL; // Id of chunks to read from file + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq = NULL, *rreq, *sreq_re, *rreq_re; // Send and recv req + MPI_Status *sstat = NULL, rstat, *sstat_re; // Send and recv status + char **sbuf = NULL, **rbuf, **sbufp, **rbufp, **sbuf_re, **rbuf_re; // Send and recv buffer + int *rsize, *ssize = NULL, *rsize_re, *ssize_re; // recv size of each message + int *sdst = NULL; // recv size of each message + int *smap = NULL; + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + NC_CHK_TIMER_START 
(NC_CHK_TIMER_GET_CB_INIT) + + // Allocate buffering for write count + rcnt_local = (int *)NCI_Malloc (sizeof (int) * (ncchkp->np * 2 + varp->nchunk * 1)); + CHK_PTR(rcnt_local) + rcnt_all = rcnt_local + ncchkp->np; + smap = rcnt_all + ncchkp->np; + + // Allocate buffering for overlaping index + tsize = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + CHK_PTR(tsize) + tssize = tsize + varp->ndim; + tstart = tssize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + CHK_PTR(ostart) + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (rcnt_local, 0, sizeof (int) * (ncchkp->np + varp->nchunk)); + nsend = 0; + + // Count total number of messages and build a map of accessed chunk to list of comm + // datastructure + rrange_local[0] = varp->nchunk; + rrange_local[1] = 0; + ncchkioi_chunk_itr_init (varp, start, count, citr, &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (rcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + rcnt_local[cown] = 1; // Need to send message if not owner + + // Record lowest and highest chunk accessed + if (rrange_local[0] > cid) { rrange_local[0] = cid; } + if (rrange_local[1] < cid) { rrange_local[1] = cid; } + } while (ncchkioi_chunk_itr_next (varp, start, count, citr, &cid)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC) + + // Sync number of messages of each chunk and access range + CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + nrecv = rcnt_all[ncchkp->rank] - + rcnt_local[ncchkp->rank]; // We don't need to 
receive request form self + + rrange_local[1] *= -1; + CHK_ERR_ALLREDUCE (rrange_local, rrange_all, 2, MPI_INT, MPI_MIN, ncchkp->comm); + rrange_all[1] *= -1; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * (2 * nsend + nrecv)); + CHK_PTR (sbuf) + ssize = (int *)NCI_Malloc (sizeof (int) * (nsend * 2 + nrecv * 1)); + CHK_PTR (ssize) + sdst = ssize + (nsend + nrecv); + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + CHK_PTR (sreq) + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nsend + nrecv)); + CHK_PTR (sstat) + + rbuf = (char **)NCI_Malloc (sizeof (char *) * (nsend + nrecv * 2)); + CHK_PTR (rbuf) + rsize = (int *)NCI_Malloc (sizeof (int) * (nsend + nrecv)); + CHK_PTR (rsize) + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + CHK_PTR (rreq) + + sbuf_re = sbuf + nsend; + sbufp = sbuf_re + nrecv; + ssize_re = ssize + nsend; + sreq_re = sreq + nsend; + sstat_re = sstat + nsend; + + rbuf_re = rbuf + nrecv; + rbufp = rbuf_re + nsend; + rsize_re = rsize + nrecv; + rreq_re = rreq + nrecv; + + // Count size of each request + memset (ssize, 0, sizeof (int) * nsend); + memset (rsize_re, 0, sizeof (int) * nsend); + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ssize[j] += sizeof (int) * (varp->ndim * 2 + 1); + rsize_re[j] += overlapsize; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + // Allocate buffer for send + for (i = 0; i < nsend; i++) { + ssize[i] += sizeof (int); + sbuf[i] = 
sbufp[i] = (char *)NCI_Malloc (ssize[i]); + CHK_PTR (sbuf[i]) + *((int *)sbufp[i]) = rsize_re[i]; + sbufp[i] += sizeof (int); + rbuf_re[i] = (char *)NCI_Malloc (rsize_re[i]); + CHK_PTR (rbuf_re[i]) + } + + // Pack requests + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Metadata + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + for (i = 0; i < varp->ndim; i++) { + tstartp[i] = (int)(ostart[i] - citr[i]); + tssizep[i] = (int)osize[i]; + } + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Post receive + for (i = 0; i < nsend; i++) { + CHK_ERR_IRECV (rbuf_re[i], rsize_re[i], MPI_BYTE, sdst[i], 1, ncchkp->comm, rreq_re + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); + CHK_PTR (rbuf[i]) + + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // We need to prepare chunk in the chunk 
cache + // For chunks not yet allocated, we need to read them from file collectively + // We collect chunk id of those chunks + // Calculate number of recv request + // This is for all the chunks + for (j = 0; j < varp->nmychunk && varp->mychunks[j] < rrange_all[0]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= rrange_all[1]; k++) + ; + rids = (int *)NCI_Malloc (sizeof (int) * (k - j)); + nread = 0; + for (i = j; i < k; i++) { + cid = varp->mychunks[i]; + // printf("checking chunk %d, size is %d\n",cid, varp->chunk_index[cid].len); + if (varp->chunk_cache[cid] == NULL) { + // err = ncchkioi_cache_alloc(ncchkp, varp->chunksize, varp->chunk_cache + cid); + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; /* printf("chunk %d need read\n",cid); */ } + } else { + // ncchkioi_cache_visit(ncchkp, varp->chunk_cache[cid]); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_INIT) + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB) // I/O time count separately + +#ifdef PNETCDF_PROFILING + MPI_Barrier (ncchkp->comm); +#endif + // Decompress chunks into chunk cache + err = ncchkioi_load_var (ncchkp, varp, nread, rids); + CHK_ERR + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + tbuf = (char *)NCI_Malloc (varp->chunksize); + + // Handle our own data + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from chunk cache to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); 
+ CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, tbuf, varp->chunksize, &packoff, + ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, buf, 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + + // Handle incoming requests + for (i = 0; i < varp->ndim; i++) { tsize[i] = varp->chunkdim[i]; } + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Will wait any provide any benefit? 
+ MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + packoff = 0; + ssize_re[j] = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + sbuf_re[j] = (char *)NCI_Malloc (ssize_re[j]); + CHK_PTR (sbuf_re[j]) + while (rbufp[j] < rbuf[j] + rsize[j]) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Metadata + cid = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + tstartp = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbuf_re[j], ssize_re[j], &packoff, + ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Send Response + CHK_ERR_ISEND (sbuf_re[j], packoff, MPI_BYTE, rstat.MPI_SOURCE, 1, ncchkp->comm, + sreq_re + j); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all request + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Handle reply + for (i = 0; i < varp->ndim; i++) { tsize[i] = count[i]; } + for (i = 0; i < nsend; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Will wait any provide any benefit? 
+ MPI_Waitany (nsend, rreq_re, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + sbufp[j] = sbuf[j] + sizeof (int); // Skip reply size + packoff = 0; + while (packoff < rsize_re[j]) { + // Retrieve metadata from the request we sent + cid = *((int *)sbufp[j]); + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + + // Bring back the request + get_chunk_itr (varp, cid, citr); + for (k = 0; k < varp->ndim; k++) { tstartp[k] += (int)(citr[k] - start[k]); } + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_UNPACK (rbuf_re[j], rsize_re[j], &packoff, buf, 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Wait for all Response + CHK_ERR_WAITALL (nrecv, sreq_re, sstat_re); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + +err_out:; + + // Free buffers + NCI_Free (rcnt_local); + + NCI_Free (rids); + + NCI_Free (tsize); + + NCI_Free (ostart); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbuf[i]); + NCI_Free (rbuf[i]); + } + NCI_Free (sbuf); + + NCI_Free (rreq); + NCI_Free (rbuf); + NCI_Free (rsize); + + if (tbuf != NULL) { NCI_Free (tbuf); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB) + + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_get_varn.c b/src/drivers/ncchunkio/ncchkioi_get_varn.c new file mode 100644 index 0000000000..40ae859ae9 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_get_varn.c @@ -0,0 +1,942 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +int ncchkioi_get_varn_cb_chunk (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + MPI_Offset *const *strides, + void **bufs) { + int err=NC_NOERR; + int i, j, k, l; + int cid, req; // Chunk iterator + + MPI_Offset *ostart, *osize; + int *tsize, *tssize, *tstart, *tsizep, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Chunk iterator + + int *rcnt_local, *rcnt_all; // Number of processes that writes to each chunk + + int overlapsize; // Size of overlaping region of request and chunk + int overlapcnt; + char *cbuf = NULL; // Intermediate continuous buffer + + int packoff, unpackoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nread; // # chunks to read form file + int *rids; // Id of chunks to read from file + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreqs, *rreqs; // Send and recv req + MPI_Status *sstats, *rstats; // Send and recv status + char **sbufs, **rbufs; // Send and recv buffer + int *rsizes; // recv size of each message + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT) + + // Allocate buffering for write count + rcnt_local = (int *)NCI_Malloc (sizeof (int) * varp->nchunk * 2); + rcnt_all = rcnt_local + varp->nchunk; + + // Allocate buffering for overlaping index + tsize = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + tssize = tsize + varp->ndim; + tstart = tssize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc 
(sizeof (MPI_Offset) * varp->ndim * 3); + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (rcnt_local, 0, sizeof (int) * varp->nchunk); + nsend = 0; + for (req = 0; req < nreq; req++) { + // Iterate through chunks + ncchkioi_chunk_itr_init (varp, starts[req], counts[req], citr, &cid); + do { + if (varp->chunk_owner[cid] != ncchkp->rank && rcnt_local[cid] == 0) { + // Count number of mnessage we need to send + nsend++; + } + + rcnt_local[cid] = 1; + } while (ncchkioi_chunk_itr_next (varp, starts[req], counts[req], citr, &cid)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, varp->nchunk, MPI_INT, MPI_SUM, ncchkp->comm); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // We need to prepare chunk in the chunk cache + // For chunks not yet allocated, we need to read them form file collectively + // We collect chunk id of those chunks + // Calculate number of recv request + // This is for all the chunks + rids = (int *)NCI_Malloc (sizeof (int) * varp->nmychunk); + nread = 0; + nrecv = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + // We don't need message for our own data + nrecv += rcnt_all[cid] - rcnt_local[cid]; + // Count number of chunks we need to prepare + // We read only chunks that is required + if (rcnt_all[cid] || rcnt_local[cid]) { + if (varp->chunk_cache[cid] == NULL) { + // err = ncchkioi_cache_alloc(ncchkp, varp->chunksize, varp->chunk_cache + cid); + // varp->chunk_cache[cid] = (NC_chk_cache*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 
0) { rids[nread++] = cid; } + } else { + // ncchkioi_cache_visit(ncchkp, varp->chunk_cache[cid]); + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_INIT) + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB) // I/O time count separately + +#ifdef PNETCDF_PROFILING + MPI_Barrier (ncchkp->comm); +#endif + // Decompress chunks into chunk cache + err = ncchkioi_load_var (ncchkp, varp, nread, rids); + CHK_ERR + + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + + // Allocate buffer for send and recv + // We need to accept nrecv requests and receive nsend of replies + rreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nrecv + nsend)); + rstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nrecv + nsend)); + rbufs = (char **)NCI_Malloc (sizeof (char *) * (nrecv + nsend)); + rsizes = (int *)NCI_Malloc (sizeof (int) * (nrecv + nsend)); + // We need to send nsend requests and reply nrecv of requests + sbufs = (char **)NCI_Malloc (sizeof (char *) * (nrecv + nsend)); + sreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nrecv + nsend)); + sstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nrecv + nsend)); + + // Post send + k = l = 0; + for (cid = 0; cid < varp->nchunk; cid++) { + if (varp->chunk_owner[cid] == ncchkp->rank) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // We are the owner of the chunk + // Receive data from other process + for (j = 0; j < rcnt_all[cid] - rcnt_local[cid]; j++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, cid, ncchkp->comm, &rmsg, rstats); + CHK_ERR_GET_COUNT (rstats, MPI_BYTE, rsizes + k); + + // Allocate buffer + rbufs[k] = (char *)NCI_Malloc (rsizes[k]); + + // Post irecv + CHK_ERR_IMRECV (rbufs[k], rsizes[k], MPI_BYTE, &rmsg, rreqs + k); + k++; + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + } else { + // We have some request to send + if 
(rcnt_local[cid] > 0) { + get_chunk_itr (varp, cid, citr); + rsizes[nrecv + l] = overlapcnt = 0; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ) + + // Calculate send buffer size + for (req = 0; req < nreq; req++) { + // Calculate chunk overlap + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + rsizes[nrecv + l] += overlapsize; + + if (overlapsize > 0) { overlapcnt++; } + } + + // Allocate buffer + // Faster to request the entire chunk + if (rsizes[nrecv + l] >= varp->chunksize) { + rsizes[nrecv + l] = varp->chunksize; + overlapcnt = 1; + } + sbufs[l] = (char *)NCI_Malloc (sizeof (int) * (overlapcnt * varp->ndim * 2) + 1); + rbufs[nrecv + l] = (char *)NCI_Malloc (rsizes[nrecv + l]); + + // Metadata + *((int *)sbufs[l]) = rsizes[nrecv + l]; + packoff = sizeof (int); + if (rsizes[nrecv + l] == + varp->chunksize) { // Request the entire chunk directly if need more than that + tstartp = (int *)(sbufs[l] + packoff); + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(sbufs[l] + packoff); + packoff += varp->ndim * sizeof (int); + memset (tstartp, 0, sizeof (int) * varp->ndim); + memcpy (tsizep, varp->chunkdim, sizeof (int) * varp->ndim); + } else { + for (req = 0; req < nreq; req++) { + // Calculate chunk overlap + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + if (overlapsize > 0) { + tstartp = (int *)(sbufs[l] + packoff); + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(sbufs[l] + packoff); + packoff += varp->ndim * sizeof (int); + // Metadata + for (j = 0; j < varp->ndim; j++) { + tstartp[j] = (int)(ostart[j] - citr[j]); + tsizep[j] = (int)osize[j]; + } + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Send request + CHK_ERR_ISEND (sbufs[l], packoff, MPI_BYTE, varp->chunk_owner[cid], cid, + ncchkp->comm, sreqs + l); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + 
NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // printf("Rank: %d, CHK_ERR_IRECV(%d, %d, %d, %d)\n", ncchkp->rank, overlapsize, + // varp->chunk_owner[cid], cid + 1024, nrecv + k); fflush(stdout); + CHK_ERR_IRECV (rbufs[l + nrecv], rsizes[nrecv + l], MPI_BYTE, + varp->chunk_owner[cid], cid + 1024, ncchkp->comm, rreqs + nrecv + l); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + + l++; + } + } + } + + // Allocate intermediate buffer + cbuf = (char *)NCI_Malloc (varp->chunksize); + + // For each chunk we own, we need to reply to incoming reqeust + k = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + // Handle our own data first if we have any + if (rcnt_local[cid] > 0) { + // Convert chunk id to iterator + get_chunk_itr (varp, cid, citr); + + for (req = 0; req < nreq; req++) { + // Calculate overlapping region + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + if (overlapsize > 0) { + // Pack type from chunk buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, cbuf, varp->chunksize, + &packoff, ncchkp->comm); + overlapsize = packoff; + MPI_Type_free (&ptype); + + // Pack type from (contiguous) intermediate buffer to user buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into user buffer + packoff = 0; + CHK_ERR_UNPACK (cbuf, 
overlapsize, &packoff, bufs[req], 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Wait for all send requests related to this chunk + // We remove the impact of -1 mark in rcnt_local[cid] + CHK_ERR_WAITALL (rcnt_all[cid] - rcnt_local[cid], rreqs + k, rstats + k); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Now, it is time to process data from other processes + for (j = 0; j < varp->ndim; j++) { tsize[j] = varp->chunkdim[j]; } + // Process data received + for (j = k; j < k + rcnt_all[cid] - rcnt_local[cid]; j++) { + packoff = 0; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Allocate buffer + overlapsize = *((int *)rbufs[j]); + unpackoff = sizeof (int); + sbufs[j + nsend] = (char *)NCI_Malloc (overlapsize); // For reply + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Pack data + while (unpackoff < rsizes[j]) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Get metadata + tstartp = (int *)(rbufs[j] + unpackoff); + unpackoff += varp->ndim * sizeof (int); + tssizep = (int *)(rbufs[j] + unpackoff); + unpackoff += varp->ndim * sizeof (int); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbufs[j + nsend], overlapsize, + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Send reply + // printf("Rank: %d, CHK_ERR_ISEND(%d, %d, %d, %d)\n", ncchkp->rank, packoff, + // varp->chunk_owner[cid], cid + 1024, k + nsend); fflush(stdout); + CHK_ERR_ISEND (sbufs[j + nsend], packoff, MPI_BYTE, 
rstats[j].MPI_SOURCE, cid + 1024, + ncchkp->comm, sreqs + j + nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + } + k += rcnt_all[cid] - rcnt_local[cid]; + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all request sent + // printf("Rank: %d, CHK_ERR_WAITALL_send(%d, %d)\n", ncchkp->rank, nsend, 0); fflush(stdout); + CHK_ERR_WAITALL (nsend, sreqs, sstats); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Receive replies from the owners and update the user buffer + k = 0; + for (cid = 0; cid < varp->nchunk; cid++) { + if (rcnt_local[cid] > 0 && varp->chunk_owner[cid] != ncchkp->rank) { + get_chunk_itr (varp, cid, citr); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Wait for reply + // printf("Rank: %d, MPI_Wait_recv(%d)\n", ncchkp->rank, nrecv + k); fflush(stdout); + MPI_Wait (rreqs + nrecv + k, rstats + nrecv + k); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + packoff = 0; + for (req = 0; req < nreq; req++) { + // Calculate chunk overlap + overlapsize = + get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize); + + if (overlapsize > 0) { + // Pack type from recv buffer to user buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_UNPACK (rbufs[nrecv + k], rsizes[nrecv + k], &packoff, bufs[req], 1, + ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + } + k++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + } + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Wait for all send replies + CHK_ERR_WAITALL (nrecv, sreqs + nsend, sstats + nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + +err_out:; + + // Free buffers + 
NCI_Free (rcnt_local); + + NCI_Free (rids); + + NCI_Free (tsize); + + NCI_Free (ostart); + + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbufs[i]); + NCI_Free (rbufs[i]); + } + NCI_Free (sreqs); + NCI_Free (sstats); + NCI_Free (sbufs); + NCI_Free (rreqs); + NCI_Free (rstats); + NCI_Free (rbufs); + NCI_Free (rsizes); + + if (cbuf != NULL) { NCI_Free (cbuf); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB) + + return err; +} + +int ncchkioi_get_varn_cb_proc (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + void **bufs) { + int err=NC_NOERR; + int i, j, k; + int cid, cown; // Chunk iterator + int req, **reqs; + + MPI_Offset *ostart, *osize; + int *tsize, *tssize, *tstart, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Chunk iterator + + int *rcnt_local, *rcnt_all; // Number of processes that writes to each proc + + int rrange_local[2], rrange_all[2]; // Number of processes that writes to each chunk + + int overlapsize; // Size of overlaping region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nread; // # chunks to read form file + int *rids; // Id of chunks to read from file + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq, *rreq, *sreq_re, *rreq_re; // Send and recv req + MPI_Status *sstat, rstat, *sstat_re; // Send and recv status + char **sbuf, **rbuf, **sbufp, **rbufp, **sbuf_re, **rbuf_re; // Send and recv buffer + int *rsize, *ssize, *rsize_re, *ssize_re; // recv size of each message + int *sdst; // recv size of each message + int *smap; + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT) + + // Allocate buffering for write count + rcnt_local = (int *)NCI_Malloc (sizeof (int) * (ncchkp->np * 2 + varp->nchunk * 1)); + rcnt_all = rcnt_local + ncchkp->np; + smap = rcnt_all + 
ncchkp->np; + + // Allocate buffering for overlaping index + tsize = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + tssize = tsize + varp->ndim; + tstart = tssize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (rcnt_local, 0, sizeof (int) * (ncchkp->np + varp->nchunk)); + nsend = 0; + + // counts[req] total number of messages and build a map of accessed chunk to list of comm + // datastructure + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init (varp, starts[req], counts[req], citr, + &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (rcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + rcnt_local[cown] = 1; // Need to send message if not owner + + // Record lowest and highest chunk accessed + if (rrange_local[0] > cid) { rrange_local[0] = cid; } + if (rrange_local[1] < cid) { rrange_local[1] = cid; } + } while (ncchkioi_chunk_itr_next (varp, starts[req], counts[req], citr, &cid)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + nrecv = rcnt_all[ncchkp->rank] - + rcnt_local[ncchkp->rank]; // We don't need to receive request form self + + rrange_local[1] *= -1; + CHK_ERR_ALLREDUCE (rrange_local, rrange_all, 2, MPI_INT, MPI_MIN, ncchkp->comm); + rrange_all[1] *= -1; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // We need to 
prepare chunk in the chunk cache + // For chunks not yet allocated, we need to read them form file collectively + // We collect chunk id of those chunks + // Calculate number of recv request + // This is for all the chunks + for (j = 0; j < varp->nmychunk && varp->mychunks[j] < rrange_all[0]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= rrange_all[1]; k++) + ; + rids = (int *)NCI_Malloc (sizeof (int) * (k - j)); + nread = 0; + for (i = j; i < k; i++) { + cid = varp->mychunks[i]; + if (varp->chunk_cache[cid] == NULL) { + // err = ncchkioi_cache_alloc(ncchkp, varp->chunksize, varp->chunk_cache + cid); + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + // ncchkioi_cache_visit(ncchkp, varp->chunk_cache[cid]); + } + } + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB) // I/O time count separately + +#ifdef PNETCDF_PROFILING + MPI_Barrier (ncchkp->comm); +#endif + // Decompress chunks into chunk cache + ncchkioi_load_var (ncchkp, varp, nread, rids); + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * (nsend * 2 + nrecv)); + ssize = (int *)NCI_Malloc (sizeof (int) * (nsend * 2 + nrecv * 1)); + sdst = ssize + (nsend + nrecv); + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nsend + nrecv)); + reqs = (int **)NCI_Malloc (sizeof (int *) * nsend); + + rbuf = (char **)NCI_Malloc (sizeof (char *) * (nsend + nrecv * 2)); + rsize = (int *)NCI_Malloc (sizeof (int) * (nsend + nrecv)); + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + + sbuf_re = sbuf + nsend; + sbufp = sbuf_re + nrecv; + ssize_re = ssize + 
nsend; + sreq_re = sreq + nsend; + sstat_re = sstat + nsend; + + rbuf_re = rbuf + nrecv; + rbufp = rbuf_re + nsend; + rsize_re = rsize + nrecv; + rreq_re = rreq + nrecv; + + // counts[req] size of each request + memset (ssize, 0, sizeof (int) * nsend); + memset (rsize_re, 0, sizeof (int) * nsend); + memset (rcnt_local, 0, sizeof (int) * nsend); + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // counts[req] overlap + overlapsize = varp->esize; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ssize[j] += sizeof (int) * (varp->ndim * 2 + 1); + rsize_re[j] += overlapsize; + rcnt_local[j]++; + } + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + + // Allocate buffer for send + for (i = 0; i < nsend; i++) { + ssize[i] += sizeof (int); + sbuf[i] = sbufp[i] = (char *)NCI_Malloc (ssize[i]); + *((int *)sbufp[i]) = rsize_re[i]; + sbufp[i] += sizeof (int); + rbuf_re[i] = (char *)NCI_Malloc (rsize_re[i]); + reqs[i] = (int *)NCI_Malloc (sizeof (int) * rcnt_local[i]); + } + + // Pack requests + memset (rcnt_local, 0, sizeof (int) * nsend); + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Metadata + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + for (i = 0; i < varp->ndim; i++) { + tstartp[i] = (int)(ostart[i] - citr[i]); + tssizep[i] = (int)osize[i]; + } + + // Record source 
of the request + reqs[j][rcnt_local[j]++] = req; + } + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Post receive + for (i = 0; i < nsend; i++) { + CHK_ERR_IRECV (rbuf_re[i], rsize_re[i], MPI_BYTE, sdst[i], 1, ncchkp->comm, rreq_re + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); + + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + tbuf = (char *)NCI_Malloc (varp->chunksize); + + // Handle our own data + for (req = 0; req < nreq; req++) { + ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from chunk cache to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, tbuf, 
varp->chunksize, + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - starts[req][j]); + tsize[j] = (int)counts[req][j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, bufs[req], 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + } while ( + ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize)); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + + // Handle incoming requests + for (i = 0; i < varp->ndim; i++) { tsize[i] = varp->chunkdim[i]; } + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Will wait any provide any benefit? + MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + packoff = 0; + ssize_re[j] = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + sbuf_re[j] = (char *)NCI_Malloc (ssize_re[j]); + while (rbufp[j] < rbuf[j] + rsize[j]) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Metadata + cid = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + tstartp = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbuf_re[j], ssize_re[j], &packoff, + ncchkp->comm); + MPI_Type_free (&ptype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) + } + + NC_CHK_TIMER_START 
(NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Send Response + CHK_ERR_ISEND (sbuf_re[j], packoff, MPI_BYTE, rstat.MPI_SOURCE, 1, ncchkp->comm, + sreq_re + j); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all request + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Handle reply + memset (rcnt_local, 0, sizeof (int) * nsend); + for (i = 0; i < nsend; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Will wait any provide any benefit? + MPI_Waitany (nsend, rreq_re, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + sbufp[j] = sbuf[j] + sizeof (int); // Skip reply size + packoff = 0; + while (packoff < rsize_re[j]) { + // Retrieve metadata from the request we sent + cid = *((int *)sbufp[j]); + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + + // Bring up the request + req = reqs[j][rcnt_local[j]++]; + get_chunk_itr (varp, cid, citr); + for (k = 0; k < varp->ndim; k++) { + tstartp[k] += (int)(citr[k] - starts[req][k]); + tsize[k] = counts[req][k]; + } + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_UNPACK (rbuf_re[j], rsize_re[j], &packoff, bufs[req], 1, ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Wait for all Response + CHK_ERR_WAITALL (nrecv, sreq_re, sstat_re); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + +err_out:; + + // Free buffers + NCI_Free (rcnt_local); + + NCI_Free (rids); + + NCI_Free (tsize); + + NCI_Free (ostart); + + NCI_Free (sreq); + NCI_Free (sstat); + 
NCI_Free (ssize); + for (i = 0; i < nsend; i++) { NCI_Free (reqs[i]); } + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbuf[i]); + NCI_Free (rbuf[i]); + } + NCI_Free (sbuf); + NCI_Free (reqs); + + NCI_Free (rreq); + NCI_Free (rbuf); + NCI_Free (rsize); + + if (tbuf != NULL) { NCI_Free (tbuf); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB) + + return err; +} + +int ncchkioi_get_varn (NC_chk *ncchkp, + NC_chk_var *varp, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *buf) { + int i, j; + MPI_Offset rsize; + char *bptr = (char *)buf; + char **bufs; + + // Calculate buffer offset of each request + bufs = (char **)NCI_Malloc (sizeof (char *) * nreq); + for (i = 0; i < nreq; i++) { + bufs[i] = bptr; + rsize = varp->esize; + for (j = 0; j < varp->ndim; j++) { rsize *= counts[i][j]; } + bptr += rsize; + } + + // Collective buffer + switch (ncchkp->comm_unit) { + case NC_CHK_COMM_CHUNK: + ncchkioi_get_varn_cb_chunk (ncchkp, varp, nreq, starts, counts, NULL, (void **)bufs); + break; + case NC_CHK_COMM_PROC: + ncchkioi_get_varn_cb_proc (ncchkp, varp, nreq, starts, counts, (void **)bufs); + break; + } + NCI_Free (bufs); + + return NC_NOERR; +} diff --git a/src/drivers/ncchunkio/ncchkioi_iget.c b/src/drivers/ncchunkio/ncchkioi_iget.c new file mode 100644 index 0000000000..1484851995 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_iget.c @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_get_var__all() : dispatcher->get_var() + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include "ncchkio_internal.h" + +static inline int +ncchkioi_init_get_req( NC_chk *ncchkp, + NC_chk_req *req, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype) { + NC_chk_var *varp = ncchkp->vars.data + varid; + + // Zero out the request + memset(req, 0, sizeof(NC_chk_req)); + + // Record request + req->starts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*)); + req->start = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim); + req->starts[0] = req->start; + memcpy(req->start, start, sizeof(MPI_Offset) * varp->ndim); + req->counts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*)); + req->count = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim); + req->counts[0] = req->count; + memcpy(req->count, count, sizeof(MPI_Offset) * varp->ndim); + if (stride != NULL){ + req->stride = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim); + memcpy(req->stride, stride, sizeof(MPI_Offset) * varp->ndim); + } + + req->varid = varid; + req->buf = (void*)buf; + req->nreq = 1; + req->buftype = buftype; + if (varp->etype != buftype){ + if (bufcount > 0){ + req->bufcount = bufcount; + } + else{ + int i; + + req->bufcount = 1; + for(i = 0; i < varp->ndim; i++){ + req->bufcount *= count[i]; + } + } + + req->xbuf = (char*)NCI_Malloc(req->bufcount * varp->esize); + } + else{ + req->xbuf = req->buf; + } + + req->xbufs = (char**)NCI_Malloc(sizeof(char*)); + req->xbufs[0] = req->xbuf; + + return NC_NOERR; +} + +int +ncchkioi_iget_var(NC_chk *ncchkp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const 
MPI_Offset *stride, + const MPI_Offset *imap, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid) +{ + int err=NC_NOERR; + int req_id; + NC_chk_req req; + + // Init request + err = ncchkioi_init_get_req(ncchkp, &req, varid, start, count, stride, imap, buf, bufcount, buftype); + + // Add to req list + ncchkioi_req_list_add(&(ncchkp->getlist), &req_id); + ncchkp->getlist.reqs[req_id] = req; + + if (reqid != NULL){ + *reqid = req_id * 2; + } + + return err; +} + +static inline int +ncchkioi_init_get_varn_req( NC_chk *ncchkp, + NC_chk_req *req, + int varid, + int nreq, + MPI_Offset *const*starts, + MPI_Offset *const*counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype) { + int i, j; + MPI_Offset rsize, boff; + NC_chk_var *varp = ncchkp->vars.data + varid; + + // Zero out the request + memset(req, 0, sizeof(NC_chk_req)); + + // Record request + req->starts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nreq); + req->start = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim * nreq); + for(i = 0; i < nreq; i++){ + req->starts[i] = req->start + i * varp->ndim; + memcpy(req->starts[i], starts[i], sizeof(MPI_Offset) * varp->ndim); + } + req->counts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nreq); + req->count = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * varp->ndim * nreq); + for(i = 0; i < nreq; i++){ + req->counts[i] = req->count + i * varp->ndim; + memcpy(req->counts[i], counts[i], sizeof(MPI_Offset) * varp->ndim); + } + + req->varid = varid; + req->buf = (void*)buf; + req->xbuf = (void*)buf; + req->nreq = nreq; + req->buftype = buftype; + if (varp->etype != buftype){ + if (bufcount > 0){ + req->bufcount = bufcount; + } + else{ + req->bufcount = 0; + for(i = 0; i < nreq; i++){ + rsize = 1; + for(j = 0; j < varp->ndim; j++){ + rsize *= counts[i][j]; + } + req->bufcount += rsize; + } + } + + req->xbuf = (char*)NCI_Malloc(req->bufcount * varp->esize); + } + else{ + req->xbuf = req->buf; + } + + // Calculate buffer for each 
individual request + req->xbufs = (char**)NCI_Malloc(sizeof(char*) * nreq); + boff = 0; + for(i = 0; i < nreq; i++){ + req->xbufs[i] = (req->xbuf + boff); + + // Advance pointer by size of the request + rsize = varp->esize; + for(j = 0; j < varp->ndim; j++){ + rsize *= counts[i][j]; + } + boff += rsize; + } + + return NC_NOERR; +} + +int +ncchkioi_iget_varn(NC_chk *ncchkp, + int varid, + int nreq, + MPI_Offset * const*starts, + MPI_Offset * const*counts, + void *buf, + MPI_Offset bufcount, + MPI_Datatype buftype, + int *reqid) +{ + int err=NC_NOERR; + int req_id; + NC_chk_req req; + + if (nreq > 1){ + err = ncchkioi_init_get_varn_req(ncchkp, &req, varid, nreq, starts, counts, buf, bufcount, buftype); + } + else{ + err = ncchkioi_init_get_req(ncchkp, &req, varid, starts[0], counts[0], NULL, NULL, buf, bufcount, buftype); + } + + // Add to req list + ncchkioi_req_list_add(&(ncchkp->getlist), &req_id); + ncchkp->getlist.reqs[req_id] = req; + + if (reqid != NULL){ + *reqid = req_id * 2; + } + + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_iget_cb.c b/src/drivers/ncchunkio/ncchkioi_iget_cb.c new file mode 100644 index 0000000000..e1aecc9784 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_iget_cb.c @@ -0,0 +1,683 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ *
+ * ncmpi_get_var_all() : dispatcher->get_var()
+ * ncmpi_get_var_all() : dispatcher->get_var()
+ * ncmpi_get_var__all() : dispatcher->get_var()
+ * ncmpi_get_var__all() : dispatcher->get_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ncchkio_internal.h"
+
+/* Serve a batch of queued get requests, one variable at a time.
+ *
+ * Our driver currently can handle only one variable at a time, so all
+ * requests on the same variable are packed into one large varn request and
+ * handed to ncchkioi_get_varn_cb_chunk.
+ *
+ * ncchkp : driver handle; requests are looked up in ncchkp->getlist
+ * nreq   : number of entries in reqids
+ * reqids : indices into ncchkp->getlist.reqs
+ * stats  : not used by this routine
+ *
+ * Returns NC_NOERR, or the first error reported by
+ * ncchkioi_get_varn_cb_chunk (previously its status was discarded). Every
+ * variable is still processed after an error so that the sequence of
+ * collective-buffering calls stays the same as before.
+ */
+int ncchkioi_iget_cb_chunk (NC_chk *ncchkp, int nreq, int *reqids, int *stats) {
+    int err = NC_NOERR, status;
+    int i, j;
+    int vid;     // Iterators for variable id
+    int *nreqs;  // Number of reqids in each variable
+    int *nums;   // Number of reqs in each varn
+    int **vreqids;
+    int num, maxnum = 0;
+    MPI_Offset **starts, **counts;
+    char **bufs;
+    NC_chk_req *req;
+
+    // Without variables there is nothing to read; this also keeps the
+    // vreqids[0] assignment below from writing into a zero-length array
+    if (ncchkp->vars.cnt <= 0) { return NC_NOERR; }
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT)
+
+    // Count total number of request in per variable for packed varn request
+    nums  = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt * 2);
+    nreqs = nums + ncchkp->vars.cnt;
+    memset (nums, 0, sizeof (int) * ncchkp->vars.cnt);
+    memset (nreqs, 0, sizeof (int) * ncchkp->vars.cnt);
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->getlist.reqs + reqids[i];
+        nreqs[req->varid]++;
+        nums[req->varid] += req->nreq;
+    }
+
+    /* Allocate a skip list of reqids for each variable
+     * At the same time, we find out the number of starts and counts we need to allocate
+     */
+    vreqids    = (int **)NCI_Malloc (sizeof (int *) * ncchkp->vars.cnt);
+    vreqids[0] = (int *)NCI_Malloc (sizeof (int) * nreq);
+    maxnum     = 0;
+    i          = 0;
+    for (vid = 0; vid < ncchkp->vars.cnt; vid++) {
+        if (nreqs[vid] > 0) {
+            // Assign buffer to reqid skip list; the first variable with
+            // requests gets offset 0, so vreqids[0] keeps the base address
+            vreqids[vid] = vreqids[0] + i;
+            i += nreqs[vid];
+
+            // maximum number of starts and counts we need across all variables
+            if (maxnum < nums[vid]) { maxnum = nums[vid]; }
+        }
+    }
+
+    // Fill up the skip list
+    memset (nreqs, 0, sizeof (int) * ncchkp->vars.cnt);
+    for (i = 0; i < nreq; i++) {
+        req                                       = ncchkp->getlist.reqs + reqids[i];
+        vreqids[req->varid][nreqs[req->varid]++] = reqids[i];
+    }
+
+    // Allocate parameters
+    starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * maxnum * 2);
+    counts = starts + maxnum;
+    bufs   = (char **)NCI_Malloc (sizeof (char *) * maxnum);
+
+    /* Pack requests variable by variable
+     */
+    for (vid = 0; vid < ncchkp->vars.cnt; vid++) {
+        if (nreqs[vid] > 0) {
+            // Collect parameters
+            num = 0;
+            for (j = 0; j < nreqs[vid]; j++) {
+                req = ncchkp->getlist.reqs + vreqids[vid][j];
+
+                if (req->nreq > 1) {
+                    for (i = 0; i < req->nreq; i++) {
+                        starts[num] = req->starts[i];
+                        counts[num] = req->counts[i];
+                        bufs[num++] = req->xbufs[i];
+                    }
+                } else {
+                    starts[num] = req->start;
+                    counts[num] = req->count;
+                    bufs[num++] = req->xbuf;
+                }
+            }
+
+            NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB)
+            NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB_INIT)
+
+            // Perform collective buffering; remember the first failure
+            status = ncchkioi_get_varn_cb_chunk (ncchkp, ncchkp->vars.data + vid, num, starts,
+                                                 counts, NULL, (void **)bufs);
+            if (err == NC_NOERR) { err = status; }
+
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB)
+            NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT)
+        }
+    }
+
+    // Free buffers
+    NCI_Free (nums);
+
+    NCI_Free (vreqids[0]);
+    NCI_Free (vreqids);
+
+    NCI_Free (starts);
+    NCI_Free (bufs);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB)
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT)
+
+    return err;
+}
+
+int ncchkioi_iget_cb_proc (NC_chk *ncchkp, int nreq, int *reqids, int *stats) {
+    int err=NC_NOERR;
+    int i, j, k;
+    int cid, cown;  // Chunk iterator
+    int vid;
+    int r, **reqs;
+
+    MPI_Offset *ostart, *osize;
+    int *tsize, *tssize, *tstart, *tssizep, *tstartp;  // Size for sub-array type
+    MPI_Offset *citr;                                  // Bounding box for chunks
overlapping my own write region + + int *rcnt_local, *rcnt_all; // Number of processes that writes to each proc + + int nread; + int *rlo_local, *rhi_local; + int *rlo_all, *rhi_all; + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + MPI_Offset poff; // Offset of buffer to pack to/ from + int plen; + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq, *rreq, *sreq_re, *rreq_re; // Send and recv req + MPI_Status *sstat, rstat, *sstat_re; // Send and recv status + char **sbuf, **sbufp, **rbuf, **rbufp, **sbuf_re, **rbuf_re; // Send and recv buffer + int *rsize, *ssize, *rsize_re, *ssize_re; // recv size of each message + int *sdst; // recv size of each message + int *smap; + MPI_Message rmsg; // Receive message + NC_chk_var *varp; + NC_chk_req *req; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_INIT) + + // Allocate buffering for write count + rcnt_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->np * 3); + rcnt_all = rcnt_local + ncchkp->np; + smap = rcnt_all + ncchkp->np; + + // Intermediate buffer for our own data + tbuf = (char *)NCI_Malloc (ncchkp->max_chunk_size); + + // Allocate buffering for overlaping index + tsize = (int *)NCI_Malloc (sizeof (int) * ncchkp->max_ndim * 3); + tssize = tsize + ncchkp->max_ndim; + tstart = tssize + ncchkp->max_ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->max_ndim * 3); + osize = ostart + ncchkp->max_ndim; + + // Chunk iterator + citr = osize + ncchkp->max_ndim; + + // Access range + rlo_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt * 5); + rhi_local = rlo_local + ncchkp->vars.cnt; + rlo_all = rhi_local + ncchkp->vars.cnt; + rhi_all = rlo_all + ncchkp->vars.cnt; + rids = rhi_all + ncchkp->vars.cnt; + + for (i = 0; i < ncchkp->vars.cnt; i++) { + rlo_local[i] = 2147483647; + 
rhi_local[i] = -1; + } + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (rcnt_local, 0, sizeof (int) * ncchkp->np); + nsend = 0; + + // count total number of messages and build a map of accessed chunk to list of comm + // datastructure + for (i = 0; i < nreq; i++) { + req = ncchkp->getlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init (varp, req->starts[r], req->counts[r], citr, + &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (rcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + rcnt_local[cown] = 1; // Need to send message if not owner + + if (rlo_local[req->varid] > cid) { rlo_local[req->varid] = cid; } + if (rhi_local[req->varid] < cid) { rhi_local[req->varid] = cid; } + } while (ncchkioi_chunk_itr_next (varp, req->starts[r], req->counts[r], citr, &cid)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (rcnt_local, rcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + nrecv = rcnt_all[ncchkp->rank] - + rcnt_local[ncchkp->rank]; // We don't need to receive request form self + +#ifdef PNETCDF_PROFILING + ncchkp->nsend += nrecv + nsend; + ncchkp->nrecv += nrecv + nsend; +#endif + + for (i = 0; i < ncchkp->vars.cnt; i++) { rhi_local[i] *= -1; } + CHK_ERR_ALLREDUCE (rlo_local, rlo_all, ncchkp->vars.cnt * 2, MPI_INT, MPI_MIN, ncchkp->comm); + for (i = 0; i < ncchkp->vars.cnt; i++) { rhi_all[i] *= -1; } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = 
(char **)NCI_Malloc (sizeof (char *) * (nsend * 2 + nrecv)); + sbufp = sbuf + (nsend + nrecv); + ssize = (int *)NCI_Malloc (sizeof (int) * (nsend * 2 + nrecv * 1)); + sdst = ssize + (nsend + nrecv); + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * (nsend + nrecv)); + reqs = (int **)NCI_Malloc (sizeof (int *) * nsend); + + rbuf = (char **)NCI_Malloc (sizeof (char *) * (nsend + nrecv * 2)); + rbufp = rbuf + (nsend + nrecv); + rsize = (int *)NCI_Malloc (sizeof (int) * (nsend + nrecv)); + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * (nsend + nrecv)); + + sbuf_re = sbuf + nsend; + ssize_re = ssize + nsend; + sreq_re = sreq + nsend; + sstat_re = sstat + nsend; + + rbuf_re = rbuf + nrecv; + rsize_re = rsize + nrecv; + rreq_re = rreq + nrecv; + + // req->counts[r] size of each request + memset (ssize, 0, sizeof (int) * nsend); + memset (rsize_re, 0, sizeof (int) * nsend); + memset (rcnt_local, 0, sizeof (int) * nsend); + for (i = 0; i < nreq; i++) { + req = ncchkp->getlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (k = 0; k < varp->ndim; k++) { overlapsize *= osize[k]; } + ssize[j] += sizeof (int) * (varp->ndim * 2 + 2); + rsize_re[j] += overlapsize; + rcnt_local[j]++; + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + // Allocate buffer for send + for (i = 0; i < nsend; i++) { + ssize[i] += sizeof (int); +#ifdef PNETCDF_DEBUG + assert (ssize[i] >= 0); +#endif + sbuf[i] = sbufp[i] = (char *)NCI_Malloc (ssize[i]); + 
*((int *)sbufp[i]) = rsize_re[i]; + sbufp[i] += sizeof (int); + rbuf_re[i] = (char *)NCI_Malloc (rsize_re[i]); + reqs[i] = (int *)NCI_Malloc (sizeof (int) * rcnt_local[i] * 2); +#ifdef PNETCDF_PROFILING + ncchkp->sendsize += ssize[i]; + ncchkp->recvsize += rsize_re[i]; +#endif + } + + // Pack requests + memset (rcnt_local, 0, sizeof (int) * nsend); + for (i = 0; i < nreq; i++) { + req = ncchkp->getlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Pack metadata + *((int *)sbufp[j]) = varp->varid; + sbufp[j] += sizeof (int); + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + for (k = 0; k < varp->ndim; k++) { + tstartp[k] = (int)(ostart[k] - citr[k]); + tssizep[k] = (int)osize[k]; + } + + // Record source of the request + reqs[j][rcnt_local[j]++] = i; + reqs[j][rcnt_local[j]++] = r; + +#ifdef PNETCDF_PROFILING + ncchkp->nremote++; +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Post receive + for (i = 0; i < nsend; i++) { + CHK_ERR_IRECV (rbuf_re[i], rsize_re[i], MPI_BYTE, sdst[i], 1, ncchkp->comm, rreq_re + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START 
(NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); +#ifdef PNETCDF_PROFILING + ncchkp->recvsize += rsize[i]; +#endif + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + nread = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (rhi_all[i] >= rlo_all[i]) { + varp = ncchkp->vars.data + i; + rids[nread] = i; + for (j = 0; j < varp->nmychunk && varp->mychunks[j] < rlo_all[i]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= rhi_all[i]; k++) + ; + rlo_all[nread] = j; + rhi_all[nread++] = k; + } + } + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB) +#ifdef PNETCDF_PROFILING + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_BARR) + MPI_Barrier (ncchkp->comm); + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_BARR) +#endif + err = ncchkioi_load_nvar (ncchkp, nread, rids, rlo_all, rhi_all); + CHK_ERR + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB) + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SELF) + + // Handle our own data + for (i = 0; i < nreq; i++) { + req = ncchkp->getlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from chunk cache to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + tssize[j] = (int)osize[j]; + } + err = + ncchkioi_subarray_off_len (varp->ndim, 
tsize, tssize, tstart, &poff, &plen); + if (err == 0) { + plen *= varp->esize; + poff *= varp->esize; + memcpy (tbuf, varp->chunk_cache[cid]->buf + poff, plen); + overlapsize = plen; + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, tbuf, varp->chunksize, + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + } + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - req->starts[r][j]); + tsize[j] = (int)req->counts[r][j]; + } + err = + ncchkioi_subarray_off_len (varp->ndim, tsize, tssize, tstart, &poff, &plen); + if (err == 0) { + plen *= varp->esize; + poff *= varp->esize; + memcpy (req->xbufs[r] + poff, tbuf, plen); + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, req->xbufs[r], 1, ptype, + ncchkp->comm); + MPI_Type_free (&ptype); + } +#ifdef PNETCDF_PROFILING + ncchkp->nlocal++; +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SELF) + + // Handle incoming requests + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REQ) + + // Will wait any provide any benefit? 
+ MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REQ) + + packoff = 0; + ssize_re[j] = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); +#ifdef PNETCDF_DEBUG + assert (ssize_re[j] >= 0); +#endif + sbuf_re[j] = (char *)NCI_Malloc (ssize_re[j]); + while (rbufp[j] < rbuf[j] + rsize[j]) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + + // Retrieve metadata + vid = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + cid = *((int *)rbufp[j]); + rbufp[j] += sizeof (int); + varp = ncchkp->vars.data + vid; + tstartp = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)rbufp[j]; + rbufp[j] += sizeof (int) * varp->ndim; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_PACK_REP) + + err = ncchkioi_subarray_off_len (varp->ndim, varp->chunkdim, tssizep, tstartp, &poff, + &plen); + if (err == 0) { + plen *= varp->esize; + poff *= varp->esize; + memcpy (sbuf_re[j] + packoff, varp->chunk_cache[cid]->buf + poff, plen); + packoff += plen; + } else { + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, varp->chunkdim, tssizep, tstartp, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_PACK (varp->chunk_cache[cid]->buf, 1, ptype, sbuf_re[j], ssize_re[j], + &packoff, ncchkp->comm); + MPI_Type_free (&ptype); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_PACK_REP) +#ifdef PNETCDF_PROFILING + ncchkp->nreq++; +#endif + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Send Response + CHK_ERR_ISEND (sbuf_re[j], packoff, MPI_BYTE, rstat.MPI_SOURCE, 1, ncchkp->comm, + sreq_re + j); +#ifdef PNETCDF_PROFILING + ncchkp->sendsize += packoff; +#endif + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_GET_CB_SEND_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Wait for all request + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REQ) + + // Handle reply + memset 
(rcnt_local, 0, sizeof (int) * nsend); + for (i = 0; i < nsend; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_RECV_REP) + + // Will wait any provide any benefit? + MPI_Waitany (nsend, rreq_re, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_RECV_REP) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_UNPACK_REP) + + sbufp[j] = sbuf[j] + sizeof (int); // Skip reply size + packoff = 0; + while (packoff < rsize_re[j]) { + // Retrieve metadata from the request we sent + vid = *((int *)sbufp[j]); + sbufp[j] += sizeof (int); + cid = *((int *)sbufp[j]); + sbufp[j] += sizeof (int); + varp = ncchkp->vars.data + vid; + tstartp = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + tssizep = (int *)sbufp[j]; + sbufp[j] += sizeof (int) * varp->ndim; + + k = reqs[j][rcnt_local[j]++]; + r = reqs[j][rcnt_local[j]++]; + req = ncchkp->getlist.reqs + reqids[k]; + get_chunk_itr (varp, cid, citr); + for (k = 0; k < varp->ndim; k++) { + tstartp[k] += (int)(citr[k] - req->starts[r][k]); + tsize[k] = req->counts[r][k]; + } + + err = ncchkioi_subarray_off_len (varp->ndim, tsize, tssizep, tstartp, &poff, &plen); + if (err == 0) { + plen *= varp->esize; + poff *= varp->esize; + memcpy (req->xbufs[r] + poff, rbuf_re[j] + packoff, plen); + packoff += plen; + } else { + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_UNPACK (rbuf_re[j], rsize_re[j], &packoff, req->xbufs[r], 1, ptype, + ncchkp->comm); + MPI_Type_free (&ptype); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_UNPACK_REP) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Wait for all Response + CHK_ERR_WAITALL (nrecv, sreq_re, sstat_re); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB_SEND_REP) + + // Free buffers + NCI_Free (rcnt_local); + + NCI_Free (tsize); + + NCI_Free (ostart); + + NCI_Free (tbuf); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + for (i = 0; i < 
nsend; i++) { NCI_Free (reqs[i]); } + for (i = 0; i < nsend + nrecv; i++) { + NCI_Free (sbuf[i]); + NCI_Free (rbuf[i]); + } + NCI_Free (sbuf); + NCI_Free (reqs); + + NCI_Free (rreq); + NCI_Free (rbuf); + NCI_Free (rsize); + + NCI_Free (rlo_local); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CB) + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_iput.c b/src/drivers/ncchunkio/ncchkioi_iput.c new file mode 100644 index 0000000000..98e97633b0 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_iput.c @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +static inline int ncchkioi_init_put_req (NC_chk *ncchkp, + NC_chk_req *req, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const void *xbuf, + const void *buf) { + int err=NC_NOERR; + NC_chk_var *varp = ncchkp->vars.data + varid; + + // Zero out the request + memset (req, 0, sizeof (NC_chk_req)); + + // Record request + req->starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *)); + req->start = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim); + req->starts[0] = req->start; + memcpy (req->start, start, sizeof (MPI_Offset) * varp->ndim); + req->counts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *)); + req->count = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim); + req->counts[0] = req->count; + memcpy (req->count, count, sizeof (MPI_Offset) * varp->ndim); + if (stride != NULL) { + req->stride = 
(MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim); + memcpy (req->stride, stride, sizeof (MPI_Offset) * varp->ndim); + } + + req->varid = varid; + req->buf = (void *)buf; + req->xbuf = (void *)xbuf; + req->xbufs = (char **)NCI_Malloc (sizeof (char *)); + req->xbufs[0] = req->xbuf; + req->nreq = 1; + + return err; +} + +int ncchkioi_iput_var (NC_chk *ncchkp, + int varid, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + const void *xbuf, + const void *buf, + int *reqid) { + int err=NC_NOERR; + int req_id; + NC_chk_req req; + + err = ncchkioi_init_put_req (ncchkp, &req, varid, start, count, stride, xbuf, buf); + + // Add to req list + ncchkioi_req_list_add (&(ncchkp->putlist), &req_id); + ncchkp->putlist.reqs[req_id] = req; + + if (reqid != NULL) { *reqid = req_id * 2 + 1; } + + return err; +} + +static inline int ncchkioi_init_put_varn_req (NC_chk *ncchkp, + NC_chk_req *req, + int varid, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *xbuf, + const void *buf) { + int err=NC_NOERR; + int i, j; + MPI_Offset rsize, boff; + NC_chk_var *varp = ncchkp->vars.data + varid; + + // Zero out the request + memset (req, 0, sizeof (NC_chk_req)); + + // Record request + req->starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * nreq); + CHK_PTR (req->starts) + req->start = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * nreq); + CHK_PTR (req->start) + for (i = 0; i < nreq; i++) { + req->starts[i] = req->start + i * varp->ndim; + memcpy (req->starts[i], starts[i], sizeof (MPI_Offset) * varp->ndim); + } + req->counts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * nreq); + CHK_PTR (req->counts) + req->count = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * nreq); + CHK_PTR (req->count) + for (i = 0; i < nreq; i++) { + req->counts[i] = req->count + i * varp->ndim; + memcpy (req->counts[i], counts[i], sizeof (MPI_Offset) * varp->ndim); + } + + // Calculate buffer for 
each individual request + req->xbufs = (char **)NCI_Malloc (sizeof (char *) * nreq); + CHK_PTR (req->xbufs) + boff = 0; + for (i = 0; i < nreq; i++) { + req->xbufs[i] = (((char *)xbuf) + boff); + + // Advance pointer by size of the request + rsize = varp->esize; + for (j = 0; j < varp->ndim; j++) { rsize *= counts[i][j]; } + boff += rsize; + } + + req->varid = varid; + req->buf = (void *)buf; + req->xbuf = (void *)xbuf; + req->nreq = nreq; + +err_out:; + return err; +} + +int ncchkioi_iput_varn (NC_chk *ncchkp, + int varid, + int nreq, + MPI_Offset *const *starts, + MPI_Offset *const *counts, + const void *xbuf, + const void *buf, + int *reqid) { + int err=NC_NOERR; + int req_id; + NC_chk_req req; + + if (nreq > 1) { + err = ncchkioi_init_put_varn_req (ncchkp, &req, varid, nreq, starts, counts, xbuf, buf); + } else { + err = ncchkioi_init_put_req (ncchkp, &req, varid, starts[0], counts[0], NULL, xbuf, buf); + } + CHK_ERR + + // Add to req list + err = ncchkioi_req_list_add (&(ncchkp->putlist), &req_id); + CHK_ERR + ncchkp->putlist.reqs[req_id] = req; + + if (reqid != NULL) { *reqid = req_id * 2 + 1; } + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_iput_cb.c b/src/drivers/ncchunkio/ncchkioi_iput_cb.c new file mode 100644 index 0000000000..1abf48eb21 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_iput_cb.c @@ -0,0 +1,620 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +/* Out drive currently can handle only one variable at a time + * We pack all request as a large varn request + */ +int ncchkioi_iput_cb_chunk (NC_chk *ncchkp, int nreq, int *reqids, int *stats) { + int i, j; + int vid; // Iterators for variable id + int *nreqs; // Number of reqids in each variable + int *nums; // Number of reqs in each varn + int **vreqids; + int num, maxnum = 0; + MPI_Offset **starts, **counts; + char **bufs; + NC_chk_req *req; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Count total number of request in per variable for packed varn request + nums = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt * 2); + nreqs = nums + ncchkp->vars.cnt; + memset (nums, 0, sizeof (int) * ncchkp->vars.cnt); + memset (nreqs, 0, sizeof (int) * ncchkp->vars.cnt); + for (i = 0; i < nreq; i++) { + req = ncchkp->putlist.reqs + reqids[i]; + nreqs[req->varid]++; + nums[req->varid] += req->nreq; + } + + /* Allocate a skip list of reqids for each vriable + * At the same time, we find out the number of starts and counts we need to allocate + */ + vreqids = (int **)NCI_Malloc (sizeof (int *) * ncchkp->vars.cnt); + vreqids[0] = (int *)NCI_Malloc (sizeof (int) * nreq); + maxnum = 0; + i = 0; + for (vid = 0; vid < ncchkp->vars.cnt; vid++) { + if (nreqs[vid] > 0) { + // Assign buffer to reqid skip list + vreqids[vid] = vreqids[0] + i; + i += nreqs[vid]; + + // maximum number of starts and counts we need across all variables + if (maxnum < nums[vid]) { maxnum = nums[vid]; } + } + } + + // Fill up the skip list + memset (nreqs, 0, sizeof (int) * ncchkp->vars.cnt); + for 
(i = 0; i < nreq; i++) { + req = ncchkp->putlist.reqs + reqids[i]; + vreqids[req->varid][nreqs[req->varid]++] = reqids[i]; + } + + // Allocate parameters + starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * maxnum * 2); + counts = starts + maxnum; + bufs = (char **)NCI_Malloc (sizeof (char *) * maxnum); + + /* Pack requests variable by variable + */ + for (vid = 0; vid < ncchkp->vars.cnt; vid++) { + if (nreqs[vid] > 0) { + // Collect parameters + num = 0; + for (j = 0; j < nreqs[vid]; j++) { + req = ncchkp->putlist.reqs + vreqids[vid][j]; + + for (i = 0; i < req->nreq; i++) { + starts[num] = req->starts[i]; + counts[num] = req->counts[i]; + bufs[num++] = req->xbufs[i]; + } + } + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT_CB_INIT) + + // Perform collective buffering + ncchkioi_put_varn_cb_chunk (ncchkp, ncchkp->vars.data + vid, num, starts, counts, NULL, + (void **)bufs); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + } + } + + // Free buffers + NCI_Free (nums); + + NCI_Free (vreqids[0]); + NCI_Free (vreqids); + + NCI_Free (starts); + NCI_Free (bufs); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + + return NC_NOERR; +} + +/* Out drive currently can handle only one variable at a time + * We pack all request as a large varn request + */ +int ncchkioi_iput_cb_proc (NC_chk *ncchkp, int nreq, int *reqids, int *stats) { + int err=NC_NOERR; + int i, j, k; + int cid, cown; // Chunk iterator and owner + int vid; + int r; + MPI_Offset *ostart = NULL, *osize; + int *tsize, *tssize, *tstart = NULL, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Bounding box for chunks overlapping my own write region + + int *wcnt_local = NULL, *wcnt_all; // Number of processes that writes to each chunk + + int nread; + int *rlo_local, *rhi_local; + int *rlo_all, *rhi_all; + int *rids; + + int overlapsize; // Size of overlaping 
region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Offset pboff; // Offset of buffer to pack to/ from + MPI_Datatype ptype; // Pack datatype + int plen; + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq = NULL, *rreq = NULL; // Send and recv req + MPI_Status *sstat = NULL, rstat; // Send and recv status + char **sbuf = NULL, **rbuf = NULL; // Send and recv buffer + char **sbufp, **rbufp; // Send and recv buffer pointer + int *rsize = NULL, *ssize = NULL; // Send and recv size of each message + MPI_Offset totalsize; + int *sdst; // recv size of each message + int *smap; + MPI_Message rmsg; // Receive message + NC_chk_var *varp; + NC_chk_req *req; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Allocate buffering for write count + wcnt_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->np * 3); + CHK_PTR (wcnt_local) + wcnt_all = wcnt_local + ncchkp->np; + smap = wcnt_all + ncchkp->np; + + // Intermediate buffer for our own data + tbuf = (char *)NCI_Malloc (ncchkp->max_chunk_size); + CHK_PTR (tbuf) + + // Allocate buffering for overlaping index + tstart = (int *)NCI_Malloc (sizeof (int) * ncchkp->max_ndim * 3); + CHK_PTR (tstart) + tssize = tstart + ncchkp->max_ndim; + tsize = tssize + ncchkp->max_ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * ncchkp->max_ndim * 3); + CHK_PTR (ostart) + osize = ostart + ncchkp->max_ndim; + + // Chunk iterator + citr = osize + ncchkp->max_ndim; + + // Access range + rlo_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt * 5); + CHK_PTR (rlo_local) + rhi_local = rlo_local + ncchkp->vars.cnt; + rlo_all = rhi_local + ncchkp->vars.cnt; + rhi_all = rlo_all + ncchkp->vars.cnt; + rids = rhi_all + ncchkp->vars.cnt; + + for (i = 0; i < ncchkp->vars.cnt; i++) { + rlo_local[i] = 2147483647; + rhi_local[i] = -1; + } + + // We need to calculate the size of message of each processes + // This 
is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (wcnt_local, 0, sizeof (int) * ncchkp->np); + nsend = 0; + + // Count total number of messages and build a map of accessed chunk to list of comm + // datastructure + for (i = 0; i < nreq; i++) { + req = ncchkp->putlist.reqs + reqids[i]; + varp = ncchkp->vars.data + req->varid; + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init (varp, req->starts[r], req->counts[r], citr, + &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (wcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + wcnt_local[cown] = 1; // Need to send message if not owner + + if (rlo_local[req->varid] > cid) { rlo_local[req->varid] = cid; } + if (rhi_local[req->varid] < cid) { rhi_local[req->varid] = cid; } + } while (ncchkioi_chunk_itr_next (varp, req->starts[r], req->counts[r], citr, &cid)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + nrecv = wcnt_all[ncchkp->rank] - + wcnt_local[ncchkp->rank]; // We don't need to receive request form self + + for (i = 0; i < ncchkp->vars.cnt; i++) { rhi_local[i] *= -1; } + CHK_ERR_ALLREDUCE (rlo_local, rlo_all, ncchkp->vars.cnt * 2, MPI_INT, MPI_MIN, ncchkp->comm); + for (i = 0; i < ncchkp->vars.cnt; i++) { rhi_all[i] *= -1; } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * nsend * 2); + CHK_PTR (sbuf) + sbufp = sbuf + nsend; + ssize = (int *)NCI_Malloc (sizeof (int) * nsend * 2); + CHK_PTR (ssize) + sdst = ssize + nsend; + sreq = 
(MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend); + CHK_PTR (sreq) + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend); + CHK_PTR (sstat) + + rbuf = (char **)NCI_Malloc (sizeof (char *) * nrecv * 2); + CHK_PTR (rbuf) + rbufp = rbuf + nrecv; + rsize = (int *)NCI_Malloc (sizeof (int) * nrecv); + CHK_PTR (rsize) + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv); + CHK_PTR (rreq) + + // Count size of each request + memset (ssize, 0, sizeof (int) * nsend); + for (k = 0; k < nreq; k++) { + req = ncchkp->putlist.reqs + reqids[k]; + varp = ncchkp->vars.data + req->varid; + + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk index and owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ssize[j] += overlapsize + sizeof (int) * (varp->ndim * 2 + 2); +#ifdef PNETCDF_DEBUG + if (ssize[j] < 0) { RET_ERR (NC_EAINT_TOO_SMALL) } +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + // Allocate buffer for send + totalsize = 0; + for (i = 0; i < nsend; i++) { +#ifdef PNETCDF_DEBUG + assert (ssize[i] >= 0); +#endif + totalsize += ssize[i]; + } + if (nsend > 0) { + sbuf[0] = sbufp[0] = (char *)NCI_Malloc (totalsize); + CHK_PTR (sbuf[0]) + for (i = 1; i < nsend; i++) { sbuf[i] = sbufp[i] = sbuf[i - 1] + ssize[i - 1]; } + } +#ifdef PNETCDF_PROFILING + ncchkp->nsend += nsend; + ncchkp->sendsize += totalsize; +#endif + + // Pack requests + for (k = 0; k < nreq; k++) { + req = ncchkp->putlist.reqs + reqids[k]; + varp = ncchkp->vars.data + req->varid; + + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], 
req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk index and owner + cown = varp->chunk_owner[cid]; + + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Pack metadata + *((int *)(sbufp[j])) = req->varid; + sbufp[j] += sizeof (int); + *((int *)(sbufp[j])) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)sbufp[j]; + sbufp[j] += varp->ndim * sizeof (int); + + for (i = 0; i < varp->ndim; i++) { + tstartp[i] = (int)(ostart[i] - citr[i]); + tssizep[i] = (int)osize[i]; + } + + // Pack type from user buffer to send buffer + for (i = 0; i < varp->ndim; i++) { + tsize[i] = (int)req->counts[r][i]; + tstart[i] = (int)(ostart[i] - req->starts[r][i]); + } + err = ncchkioi_subarray_off_len (varp->ndim, tsize, tssizep, tstart, &pboff, + &plen); + if (err == 0) { + plen *= varp->esize; + pboff *= varp->esize; +#ifdef PNETCDF_DEBUG + if (sbufp[j] - sbuf[j] + plen > ssize[j]) { RET_ERR (NC_EINTERNAL) } +#endif + memcpy (sbufp[j], req->xbufs[r] + pboff, plen); + sbufp[j] += plen; + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + packoff = 0; + CHK_ERR_PACK (req->xbufs[r], 1, ptype, sbufp[j], ssize[j], &packoff, + ncchkp->comm); + sbufp[j] += packoff; + MPI_Type_free (&ptype); + } + +#ifdef PNETCDF_PROFILING + ncchkp->nremote++; +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + if (ssize[i] == 24 && sdst[i] == 983) { + printf ("rank %d: wrong ssize to 983\n", ncchkp->rank); + abort (); + } + } + + NC_CHK_TIMER_STOP 
(NC_CHK_TIMER_PUT_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); + CHK_PTR (rbuf[i]) + +#ifdef PNETCDF_PROFILING + ncchkp->recvsize += rsize[i]; +#endif + + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } +#ifdef PNETCDF_PROFILING + ncchkp->nrecv += nrecv; +#endif + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + nread = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (rhi_all[i] >= rlo_all[i]) { + varp = ncchkp->vars.data + i; + rids[nread] = i; + for (j = 0; j < varp->nmychunk && varp->mychunks[j] < rlo_all[i]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= rhi_all[i]; k++) + ; + rlo_all[nread] = j; + rhi_all[nread++] = k; + } + } + +#ifdef PNETCDF_PROFILING + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_BARR) + MPI_Barrier (ncchkp->comm); + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_BARR) +#endif + + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT_CB) + err = ncchkioi_load_nvar_bg (ncchkp, nread, rids, rlo_all, rhi_all); + CHK_ERR + // Increase batch number to indicate allocated chunk buffer can be freed for future + // allocation + (ncchkp->cache_serial) + ++; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle our own data + for (k = 0; k < nreq; k++) { + req = ncchkp->putlist.reqs + reqids[k]; + varp = ncchkp->vars.data + req->varid; + + for (r = 0; r < req->nreq; r++) { + ncchkioi_chunk_itr_init_ex (varp, req->starts[r], req->counts[r], citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk index and owner + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from user buffer to (contiguous) intermediate buffer + for (j = 0; j < 
varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - req->starts[r][j]); + tsize[j] = (int)req->counts[r][j]; + tssize[j] = (int)osize[j]; + } + err = ncchkioi_subarray_off_len (varp->ndim, tsize, tssize, tstart, &pboff, + &plen); + if (err == 0) { + plen *= varp->esize; + pboff *= varp->esize; + memcpy (tbuf, req->xbufs[r] + pboff, plen); + overlapsize = plen; + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (req->xbufs[r], 1, ptype, tbuf, varp->chunksize, &packoff, + ncchkp->comm); + MPI_Type_free (&ptype); + overlapsize = packoff; + } + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + } + err = ncchkioi_subarray_off_len (varp->ndim, tsize, tssize, tstart, &pboff, + &plen); + if (err == 0) { + plen *= varp->esize; + pboff *= varp->esize; + memcpy (varp->chunk_cache[cid]->buf + pboff, tbuf, plen); + } else { + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, + ptype, ncchkp->comm); + MPI_Type_free (&ptype); + } + + // Mark chunk as dirty + varp->dirty[cid] = 1; +#ifdef PNETCDF_PROFILING + ncchkp->nlocal++; +#endif + } + } while (ncchkioi_chunk_itr_next_ex (varp, req->starts[r], req->counts[r], citr, &cid, + ostart, osize)); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle incoming requests + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Will wait any provide any benefit? 
+ MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + + packoff = 0; + while (rbufp[j] - rbuf[j] < rsize[j]) { + // Retrieve metadata + vid = *((int *)(rbufp[j])); + rbufp[j] += sizeof (int); + cid = *((int *)(rbufp[j])); + rbufp[j] += sizeof (int); + varp = ncchkp->vars.data + vid; + tstartp = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + + err = ncchkioi_subarray_off_len (varp->ndim, varp->chunkdim, tssizep, tstartp, &pboff, + &plen); + if (err == 0) { + plen *= varp->esize; + pboff *= varp->esize; +#ifdef PNETCDF_DEBUG + if (rbufp[j] - rbuf[j] + plen > rsize[j]) { RET_ERR (NC_EINTERNAL) } +#endif + memcpy (varp->chunk_cache[cid]->buf + pboff, rbufp[j], plen); + rbufp[j] += plen; + } else { + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, varp->chunkdim, tssizep, tstartp, + MPI_ORDER_C, varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + packoff = 0; + CHK_ERR_UNPACK (rbufp[j], rsize[j], &packoff, varp->chunk_cache[cid]->buf, 1, ptype, + ncchkp->comm); + rbufp[j] += packoff; + MPI_Type_free (&ptype); + } + + // Mark chunk as dirty + varp->dirty[cid] = 1; + +#ifdef PNETCDF_PROFILING + ncchkp->nreq++; +#endif + } + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + +err_out:; + // Free buffers + NCI_Free (wcnt_local); + + NCI_Free (tstart); + + NCI_Free (ostart); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + if (nsend > 0) { NCI_Free (sbuf[0]); } + NCI_Free (sbuf); + + NCI_Free (rreq); + for (i = 0; i < nrecv; i++) { NCI_Free (rbuf[i]); } + NCI_Free (rbuf); + NCI_Free (rsize); + + NCI_Free (tbuf); + + NCI_Free (rlo_local); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + + return err; +} diff 
--git a/src/drivers/ncchunkio/ncchkioi_lagacy.c b/src/drivers/ncchunkio/ncchkioi_lagacy.c
new file mode 100644
index 0000000000..811a7e68c2
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_lagacy.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (C) 2017, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ *
+ * ncmpi_get_var_all() : dispatcher->get_var()
+ * ncmpi_put_var_all() : dispatcher->put_var()
+ * ncmpi_get_var_<type>_all() : dispatcher->get_var()
+ * ncmpi_put_var_<type>_all() : dispatcher->put_var()
+ */
+
+/* NOTE(review): the header names of the #include directives below are missing
+ * in this copy of the patch; restore them from the repository before applying. */
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include "ncchkio_internal.h"
+
+#define min(a,b) (((a)<(b))?(a):(b))
+#define max(a,b) (((a)>(b))?(a):(b))
+
+/*
+ * Legacy blocking read of a chunked variable.
+ * Reads every compressed chunk touched by the request with a single varn call,
+ * decompresses the chunks into one staging buffer and packs the requested
+ * sub-array into buf.
+ * Returns NC_NOERR, or the error code reported by a failing driver call.
+ * NOTE(review): imap, bufcount and buftype are accepted but never read, and the
+ * early error returns do not release the buffers allocated up to that point.
+ */
+int
+ncchkioi_get_var_old(NC_chk *ncchkp,
+        NC_chk_var *varp,
+        const MPI_Offset *start,
+        const MPI_Offset *count,
+        const MPI_Offset *stride,
+        const MPI_Offset *imap,
+        void *buf,
+        MPI_Offset bufcount,
+        MPI_Datatype buftype,
+        int reqMode)
+{
+    int i, j, err=NC_NOERR;
+    nc_type xtype;
+    int *cstart, *cend, *ccord;
+    int nb, bsize;
+    int datavarid;
+    int *bidx;
+    int *tsize, *tssize, *tstart;
+    int tpos;
+    MPI_Datatype subarytype;
+    char *rbuffer, *cbuffer;
+    MPI_Offset cbsize;
+    MPI_Offset **starts, **counts;
+
+    // Boundary of chunks involved
+    cstart = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+    ccord = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+    cend = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+    for(i = 0; i < varp->ndim; i++){
+        cstart[i] = start[i] / varp->chunkdim[i];
+        // cend is exclusive (one past the last chunk touched) in both branches;
+        // the loops below compare ccord[j] >= cend[j] and use cend - cstart as a count
+        if (stride == NULL){
+            cend[i] = (start[i] + count[i] - 1) / varp->chunkdim[i] + 1;
+        }
+        else{
+            cend[i] = (start[i] + (count[i] - 1) * stride[i]) / varp->chunkdim[i] + 1;
+        }
+    }
+
+    // Number of chunks involved
+    nb = 1;
+    for(i = 0; i < varp->ndim; i++){
+        nb *= cend[i] - cstart[i];
+    }
+
+    /* Use a varn call to read all compressed chunk involved
+     * Generate one request for each chunk
+     */
+
+    bidx = (int*)NCI_Malloc(sizeof(int) * nb);
+    starts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nb);
+    counts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nb);
+    // Iterate through all chunks involved
+    i = 0;
+    cbsize = 0;
+    memcpy(ccord, cstart, sizeof(int) * varp->ndim);
+    for(i = 0; i < nb; i++){
+        j = get_chunk_idx(varp, ccord);
+        bidx[i] = j; // chunk idx
+        cbsize += varp->data_lens[j]; // total buffer size of compressed data
+        starts[i] = varp->chunk_index + j; // start of the chunk
+        counts[i] = varp->data_lens + j; // count of the chunk
+
+        // move on to next chunk
+        ccord[varp->ndim - 1]++;
+        for(j = varp->ndim - 1; j > 0; j--){
+            if (ccord[j] >= cend[j]){
+                ccord[j - 1]++;
+                ccord[j] = cstart[j];
+            }
+        }
+    }
+
+    // Allocate buffers
+    cbuffer = (char*)NCI_Malloc(cbsize); // Compressed data
+
+    // Locate data var
+    err = ncchkp->driver->get_var(ncchkp->ncp, varp->varid, NULL, NULL, NULL, NULL, &datavarid, 1, MPI_INT, reqMode);
+    if (err != NC_NOERR) return err;
+
+    // read compressed data
+    err = ncchkp->driver->get_varn(ncchkp->ncp, datavarid, nb, starts, counts, cbuffer, cbsize, MPI_BYTE, reqMode);
+    if (err != NC_NOERR) return err;
+
+    // Decompression
+
+    // Calculate chunk size
+    // Original datatype
+    err = ncchkp->driver->get_att(ncchkp->ncp, varp->varid, "_datatype", &xtype, MPI_INT);
+    if (err != NC_NOERR) return err;
+
+    // Calculate chunk size
+    bsize = (int)NC_Type_size(xtype);
+    for(i = 0; i < varp->ndim; i++){
+        bsize *= varp->chunkdim[i];
+    }
+
+    // Allocate buffers
+    rbuffer = NCI_Malloc(bsize * nb); // Decompressed data
+
+    // Decompress chunks
+    cbsize = 0;
+    for(i = 0; i < nb; i++){
+        j = bidx[i];
+        if (varp->data_lens[j] > 0){
+            varp->filter_driver->decompress(cbuffer + cbsize, varp->data_lens[j], rbuffer + bsize * i, NULL, varp->ndim, varp->dimsize, ncmpii_nc2mpitype(xtype));
+        }
+        else{
+            memset(rbuffer + bsize * i, 0, bsize);
+        }
+        cbsize += varp->data_lens[j]; // move to next chunk location
+    }
+
+    // Copy data into user buffer
+
+    // Create datatype of querying domain in the decompressed domain
+    tsize = NCI_Malloc(sizeof(int) * varp->ndim);
+    tssize = NCI_Malloc(sizeof(int) * varp->ndim);
+    tstart = NCI_Malloc(sizeof(int) * varp->ndim);
+    for(i = 0; i < varp->ndim; i++){
+        tsize[i] = (cend[i] - cstart[i]) * varp->chunkdim[i];
+        tssize[i] = (int)count[i];
+        tstart[i] = start[i] % varp->chunkdim[i];
+    }
+    CHK_ERR_TYPE_CREATE_SUBARRAY(varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, ncmpii_nc2mpitype(xtype), &subarytype);
+    CHK_ERR_TYPE_COMMIT(&subarytype);
+
+    // Pack data into user buffer
+    tpos = 0;
+    CHK_ERR_PACK(rbuffer, bsize * nb, subarytype, buf, bsize * nb, &tpos, ncchkp->comm);
+
+    // Free datatype
+    MPI_Type_free(&subarytype);
+
+    // Release the work buffers; starts[] and counts[] only hold pointers into
+    // varp->chunk_index and varp->data_lens, so only the pointer arrays are freed
+    NCI_Free(cstart);
+    NCI_Free(cend);
+    NCI_Free(ccord);
+    NCI_Free(bidx);
+    NCI_Free(starts);
+    NCI_Free(counts);
+    NCI_Free(cbuffer);
+    NCI_Free(rbuffer);
+    NCI_Free(tsize);
+    NCI_Free(tssize);
+    NCI_Free(tstart);
+
+    return NC_NOERR;
+}
+
+/*
+ * Legacy collective write of a chunked variable.
+ * Steps: gather every process's start/count/stride, exchange the overlapping
+ * data with the chunk owners through MPI_Alltoallv, unpack it into per-chunk
+ * buffers, compress each owned chunk, agree on the compressed sizes with an
+ * allreduce, define the data variable and write the compressed chunks with a
+ * varn call.
+ * Returns NC_NOERR, or the error code reported by a failing MPI/driver call.
+ * NOTE(review): the early error returns do not release the buffers allocated
+ * up to that point.
+ */
+int
+ncchkioi_put_var_old(NC_chk *ncchkp,
+        NC_chk_var *varp,
+        const MPI_Offset *start,
+        const MPI_Offset *count,
+        const MPI_Offset *stride,
+        void *buf)
+{
+    int i, j, k, err=NC_NOERR;
+    nc_type xtype; // Variable data type in NC
+    MPI_Datatype etype; // Variable element type in MPI
+    int esize; // Variable element size
+    int *cstart, *cend, *ccord; // Bounding box for chunks overlapping my own write region
+    int nb, bsize; //number of chunks this process write to and chunk size
+    int datavarid; // Id of data variable
+    int *tsize, *tssize, *tstart; // Size for sub-array type
+    int nmychunks, *mychunks; // chunk count and id this process handles
+    int *sendcounts, *sdispls; // Send count and displacements in buffer
+    int *recvcounts, *rdispls; // Receive count and displacement in buffer
+    int *packoff; // Offset in mpi packing
+    int *zipsize, *zdispls; // Compressed count and displacement of my chunks in buffer
+    int ztotal; // Estimated total compressed size of my chunks
+    int *zsize_local, *zsize_all; // Compressed size of all chunks at local and global (all processes)
+    int *zdispls_all; // Compressed displacement of all chunks (all processes)
+    int overlapsize; // Size of overlapping region between a chunk and write region
+    MPI_Datatype ptype; // Pack datatype
+    char *zbuf, *xbuf; // Compressed and uncompressed data buffer
+    char *sbuf, *rbuf; // Send and receive buffer
+    MPI_Offset **start_all, **count_all, **stride_all; // Start, count, stride of all processes
+    char name[128]; // Name of objects
+    int zdimid; // dimension id for compressed data variable
+    MPI_Offset **zstarts, **zcounts; // Starts and counts in the varn call for compressed data
+
+    // Original datatype and size
+    err = ncchkp->driver->get_att(ncchkp->ncp, varp->varid, "_datatype", &xtype, MPI_INT);
+    if (err != NC_NOERR) return err;
+    esize = NC_Type_size(xtype);
+    etype = ncmpii_nc2mpitype(xtype);
+
+    // Calculate chunk size
+    bsize = esize;
+    for(i = 0; i < varp->ndim; i++){
+        bsize *= varp->chunkdim[i];
+    }
+
+    // Allocate buffering for overlaping index
+    tsize = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+    tssize = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+    tstart = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+
+    /*
+     * Gather start, count, stride to all processes
+     */
+
+    // Allocate buffer
+
+    start_all = NCI_Malloc(sizeof(MPI_Offset*) * ncchkp->np);
+    count_all = NCI_Malloc(sizeof(MPI_Offset*) * ncchkp->np);
+    stride_all = NCI_Malloc(sizeof(MPI_Offset*) * ncchkp->np);
+
+    start_all[0] = NCI_Malloc(sizeof(MPI_Offset) * ncchkp->np * varp->ndim);
+    count_all[0] = NCI_Malloc(sizeof(MPI_Offset) * ncchkp->np * varp->ndim);
+    stride_all[0] = NCI_Malloc(sizeof(MPI_Offset) * ncchkp->np * varp->ndim);
+
+    for(i = 1; i < ncchkp->np; i++){
+        start_all[i] = start_all[0] + i * varp->ndim;
+        count_all[i] = count_all[0] + i * varp->ndim;
+        stride_all[i] = stride_all[0] + i * varp->ndim;
+    }
+
+    // Call allgather
+
+    err = MPI_Allgather(start, varp->ndim, MPI_LONG_LONG_INT, start_all[0], varp->ndim, MPI_LONG_LONG_INT, ncchkp->comm);
+    if (err != MPI_SUCCESS){
+        err = ncmpii_error_mpi2nc(err, "MPI_Allgather");
+        DEBUG_RETURN_ERROR(err);
+    }
+
+    if (count != NULL){
+        err = MPI_Allgather(count, varp->ndim, MPI_LONG_LONG_INT, count_all[0], varp->ndim, MPI_LONG_LONG_INT, ncchkp->comm);
+        if (err != MPI_SUCCESS){
+            err = ncmpii_error_mpi2nc(err, "MPI_Allgather");
+            DEBUG_RETURN_ERROR(err);
+        }
+    }
+
+    if (stride != NULL){
+        err = MPI_Allgather(stride, varp->ndim, MPI_LONG_LONG_INT, stride_all[0], varp->ndim, MPI_LONG_LONG_INT, ncchkp->comm);
+        if (err != MPI_SUCCESS){
+            err = ncmpii_error_mpi2nc(err, "MPI_Allgather");
+            DEBUG_RETURN_ERROR(err);
+        }
+    }
+
+    /*
+     * Now, we need to send data to the chunk owner as well as receive data for our own chunk
+     */
+
+    // First, compute chunk boundary, find overlapping chunks
+    cstart = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+    ccord = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+    cend = (int*)NCI_Malloc(sizeof(int) * varp->ndim);
+    for(i = 0; i < varp->ndim; i++){
+        cstart[i] = start[i] / varp->chunkdim[i];
+        if (stride == NULL){
+            cend[i] = (start[i] + count[i] - 1) / varp->chunkdim[i] + 1;
+        }
+        else{
+            cend[i] = (start[i] + (count[i] - 1) * stride[i]) / varp->chunkdim[i] + 1;
+        }
+    }
+
+    // Calculate the amount we need to send to other process
+    sendcounts = (int*)NCI_Malloc(sizeof(int) * ncchkp->np);
+    sdispls = (int*)NCI_Malloc(sizeof(int) * ncchkp->np);
+    packoff = (int*)NCI_Malloc(sizeof(int) * ncchkp->np);
+    memset(sendcounts, 0, sizeof(int) * ncchkp->np);
+    memset(packoff, 0, sizeof(int) * ncchkp->np);
+
+    // Iterate through all chunks involved to count send size
+    i = 0;
+    overlapsize = 0;
+    memcpy(ccord, cstart, sizeof(int) * varp->ndim);
+    while(ccord[0] < cend[0]){
+        j = varp->chunk_owner[get_chunk_idx(varp, ccord)];
+
+        // Overlapping size of this chunk
+        overlapsize = get_chunk_overlap(varp, ccord, start, count, stride, tstart, tssize);
+        sendcounts[j] += overlapsize;
+
+        // move on to next chunk
+        ccord[varp->ndim - 1]++;
+        for(j = varp->ndim - 1; j > 0; j--){
+            if (ccord[j] >= cend[j]){
+                ccord[j - 1]++;
+                ccord[j] = cstart[j];
+            }
+        }
+    }
+
+    // Buffer displacement
+    sdispls[0] = 0;
+    for(i = 1; i < ncchkp->np; i++){
+        sdispls[i] = sendcounts[i - 1] + sdispls[i - 1];
+    }
+
+    // Allocate send buffer
+    sbuf = (char*)NCI_Malloc(sdispls[ncchkp->np - 1] + sendcounts[ncchkp->np - 1]);
+
+    // Pack data into send buffer
+
+    // Iterate through all chunks involved again, this time actually pack the data
+    for(i = 0; i < varp->ndim; i++){
+        tsize[i] = (int)count[i];
+    }
+    i = 0;
+    overlapsize = 0;
+    memcpy(ccord, cstart, sizeof(int) * varp->ndim);
+    while(ccord[0] < cend[0]){
+        j = varp->chunk_owner[get_chunk_idx(varp, ccord)];
+
+        // Overlapping region of this chunk
+        get_chunk_overlap(varp, ccord, start, count, stride, tstart, tssize);
+        for(k = 0; k < varp->ndim; k++){
+            tstart[k] -= (int)start[k];
+        }
+
+        // Pack type
+        CHK_ERR_TYPE_CREATE_SUBARRAY(varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, ncmpii_nc2mpitype(xtype), &ptype);
+        CHK_ERR_TYPE_COMMIT(&ptype);
+
+        // Pack data
+        CHK_ERR_PACK(buf, 1, ptype, sbuf + sdispls[j], sendcounts[j], packoff + j, ncchkp->comm);
+
+        // Free datatype
+        MPI_Type_free(&ptype);
+
+        // move on to next chunk
+        ccord[varp->ndim - 1]++;
+        for(j = varp->ndim - 1; j > 0; j--){
+            if (ccord[j] >= cend[j]){
+                ccord[j - 1]++;
+                ccord[j] = cstart[j];
+            }
+        }
+    }
+
+    /*
+     * Determine chunk ownership
+     * Find my chunks
+     */
+    nmychunks = 0;
+    for(i = 0; i < varp->nchunk; i++){
+        if (varp->chunk_owner[i] == ncchkp->rank){
+            nmychunks++;
+        }
+    }
+
+    // Gather chunk id this process handled to prevent a search in the future
+    mychunks = (int*)NCI_Malloc(sizeof(int) * nmychunks);
+    nmychunks = 0;
+    for(i = 0; i < varp->nchunk; i++){
+        if (varp->chunk_owner[i] == ncchkp->rank){
+            mychunks[nmychunks] = i;
+            nmychunks++;
+        }
+    }
+
+    /*
+     * Compute size to receive
+     * We only need size here, packing will happen after receving
+     */
+
+    // Calculate the amount we need to receive from other process
+    recvcounts = (int*)NCI_Malloc(sizeof(int) * ncchkp->np);
+    rdispls = (int*)NCI_Malloc(sizeof(int) * ncchkp->np);
+    memset(recvcounts, 0, sizeof(int) * ncchkp->np);
+    memset(packoff, 0, sizeof(int) * ncchkp->np);
+    for(i = 0; i < varp->nchunk; i++){
+        if (varp->chunk_owner[i] == ncchkp->rank){
+            get_chunk_cord(varp, i, ccord);
+
+            for(j = 0; j < ncchkp->np; j++){
+                // Overlapping region of this chunk
+                get_chunk_overlap(varp, ccord, start_all[j], count_all[j], stride_all[j], tstart, tssize);
+
+                overlapsize = esize;
+                for(k = 0; k < varp->ndim; k++){
+                    overlapsize *= tssize[k];
+                }
+                recvcounts[j] += overlapsize;
+            }
+        }
+    }
+
+    // Buffer displacement
+    rdispls[0] = 0;
+    for(i = 1; i < ncchkp->np; i++){
+        rdispls[i] = recvcounts[i - 1] + rdispls[i - 1];
+    }
+
+    // Allocate receive buffer
+    rbuf = (char*)NCI_Malloc(rdispls[ncchkp->np - 1] + recvcounts[ncchkp->np - 1]);
+
+    // Send the data to destination
+    MPI_Alltoallv(sbuf, sendcounts, sdispls, MPI_BYTE, rbuf, recvcounts, rdispls, MPI_BYTE, ncchkp->comm);
+
+/*
+#ifdef PNETCDF_DEBUG
+    if (ncchkp->rank == 0){
+        printf("Rank %d: sendcount = {", ncchkp->rank);
+        for(i = 0; i < ncchkp->np; i++){
+            printf("%d, ", sendcounts[i]);
+        }
+        printf("}, sdispls = {");
+        for(i = 0; i < ncchkp->np; i++){
+            printf("%d, ", sdispls[i]);
+        }
+        printf("}, recvcounts = {");
+        for(i = 0; i < ncchkp->np; i++){
+            printf("%d, ", recvcounts[i]);
+        }
+        printf("}, rdispls = {");
+        for(i = 0; i < ncchkp->np; i++){
+            printf("%d, ", rdispls[i]);
+        }
+        printf("}, sbuf = {");
+        for(i = 0; i < sdispls[ncchkp->np - 1] + sendcounts[ncchkp->np - 1]; i++){
+            printf("%x ", sbuf[i]);
+        }
+        printf("}, rbuf = {");
+        for(i = 0; i < rdispls[ncchkp->np - 1] + recvcounts[ncchkp->np - 1]; i++){
+            printf("%x ", rbuf[i]);
+        }
+        printf("}\n");
+        fflush(stdout);
+    }
+#endif
+*/
+
+    /*
+     * Next step is to pack data to chunk buffer
+     */
+
+    // Allocate buffer
+    xbuf = (char*)NCI_Malloc(nmychunks * bsize);
+
+    // Main array is the whole chunk
+    for(i = 0; i < varp->ndim; i++){
+        tsize[i] = varp->chunkdim[i];
+    }
+
+    // Pack data
+    memset(packoff, 0, sizeof(int) * ncchkp->np);
+    for(i = 0; i < nmychunks; i++){
+        get_chunk_cord(varp, mychunks[i], ccord);
+
+        for(j = 0; j < ncchkp->np; j++){
+            // Overlapping region of this chunk
+            overlapsize = get_chunk_overlap(varp, ccord, start_all[j], count_all[j], stride_all[j], tstart, tssize);
+
+            if (overlapsize > 0){
+                // Overlap size
+                //overlapsize = esize;
+                //for(k = 0; k < varp->ndim; k++){
+                //    overlapsize *= tssize[k];
+                //}
+
+                // The chunk is the main array, overlapping region is the subarray
+                for(k = 0; k < varp->ndim; k++){
+                    tstart[k] -= ccord[k] * varp->chunkdim[k];
+                }
+
+                // Pack type
+                CHK_ERR_TYPE_CREATE_SUBARRAY(varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, ncmpii_nc2mpitype(xtype), &ptype);
+                CHK_ERR_TYPE_COMMIT(&ptype);
+
+                // Pack data
+                CHK_ERR_UNPACK(rbuf + rdispls[j], overlapsize, packoff + j, xbuf + bsize * i, 1, ptype, ncchkp->comm);
+
+                // Free datatype
+                MPI_Type_free(&ptype);
+            }
+        }
+    }
+
+/*
+#ifdef PNETCDF_DEBUG
+    if (ncchkp->rank == 0){
+        printf("Rank %d: xbuf = {", ncchkp->rank);
+        for(i = 0; i < nmychunks * bsize; i++){
+            printf("%x ", xbuf[i]);
+        }
+        printf("}\n");
+        fflush(stdout);
+    }
+#endif
+*/
+
+    /*
+     * The buffer is now filled with data coming from all processes, it's time to compress
+     */
+
+    // compressed size and displacement
+    zipsize = (int*)NCI_Malloc(sizeof(int) * nmychunks);
+    zdispls = (int*)NCI_Malloc(sizeof(int) * (nmychunks + 1));
+    memset(zipsize, 0, sizeof(int) * nmychunks);
+    memset(zdispls, 0, sizeof(int) * (nmychunks + 1));
+
+    // Calculate compressed data size
+    for(i = 0; i < nmychunks; i++){
+        // Calculate compressed size
+        // This is just estimate
+        varp->filter_driver->compress(xbuf + bsize * i, bsize, NULL, zipsize + i, varp->ndim, varp->chunkdim, etype);
+    }
+
+    // Calculate total size
+    // Sum every chunk (including chunk 0) into a separate variable so that
+    // zdispls[0] stays 0 and can serve as the offset of the first chunk below
+    ztotal = 0;
+    for(i = 0; i < nmychunks; i++){
+        ztotal += zipsize[i];
+    }
+
+    // Allocate buffer
+    zbuf = (char*)NCI_Malloc(ztotal);
+
+    // Perform real compression
+    for(i = 0; i < nmychunks; i++){
+        // Compressed the data
+        // We get real size here
+        varp->filter_driver->compress(xbuf + bsize * i, bsize, zbuf + zdispls[i], zipsize + i, varp->ndim, varp->chunkdim, etype);
+
+        // Calculate offset
+        zdispls[i + 1] = zdispls[i] + zipsize[i];
+    }
+
+/*
+#ifdef PNETCDF_DEBUG
+    if (ncchkp->rank == 0){
+        printf("Rank %d: zipsize = {", ncchkp->rank);
+        for(i = 0; i < nmychunks; i++){
+            printf("%x ", zipsize[i]);
+        }
+        printf("}, zdispls = {");
+        for(i = 0; i < nmychunks; i++){
+            printf("%d, ", zdispls[i]);
+        }
+        printf("}, zbuf = {");
+        for(i = 0; i < zdispls[nmychunks- 1] + zipsize[nmychunks - 1]; i++){
+            printf("%x ", zbuf[i]);
+        }
+        printf("}\n");
+        fflush(stdout);
+    }
+#endif
+*/
+
+    /*
+     * Now it is time for a collective write
+     * We start by syncing compressed size on all processes
+     * Then, we can create variable large enough to store compressed data
+     * Finally, we do collective write to store the data
+     */
+
+    // First sync on compressed chunk size
+    // We use a all MAX reduce on all chunks
+    // An alternative is to allgather and unpack the info
+
+    // Allocate buffer
+    zsize_local = (int*)NCI_Malloc(sizeof(int) * varp->nchunk);
+    zsize_all = (int*)NCI_Malloc(sizeof(int) * varp->nchunk);
+    zdispls_all = (int*)NCI_Malloc(sizeof(int) * varp->nchunk);
+    memset(zsize_local, 0, sizeof(int) * varp->nchunk);
+    memset(zsize_all, 0, sizeof(int) * varp->nchunk);
+    memset(zdispls_all, 0, sizeof(int) * varp->nchunk);
+
+    // Fill up local size
+    for(i = 0; i < nmychunks; i++){
+        zsize_local[mychunks[i]] = zipsize[i];
+    }
+
+    // All reduce
+    CHK_ERR_ALLREDUCE(zsize_local, zsize_all, varp->nchunk, MPI_INT, MPI_MAX, ncchkp->comm);
+
+    // Calculate variable displacement
+    zdispls_all[0] = 0;
+    for(i = 1; i < varp->nchunk; i++){
+        zdispls_all[i] = zsize_all[i - 1] + zdispls_all[i - 1];
+    }
+
+/*
+#ifdef PNETCDF_DEBUG
+    if (ncchkp->rank == 0){
+        printf("Rank %d: zsize_all = {", ncchkp->rank);
+        for(i = 0; i < varp->nchunk; i++){
+            printf("%x ", zsize_all[i]);
+        }
+        printf("}, zdispls_all = {");
+        for(i = 0; i < varp->nchunk; i++){
+            printf("%d, ", zdispls_all[i]);
+        }
+        printf("}, varid = { %d", varp->varid);
+        printf("}, datavarid = { %d", varp->datavarid);
+        printf("}\n");
+        fflush(stdout);
+    }
+#endif
+*/
+
+    // Enter redefine mode
+    ncchkp->driver->redef(ncchkp->ncp);
+
+    // Define dimension for data variable
+    sprintf(name, "_compressed_data_dim_%d", varp->varid);
+    err = ncchkp->driver->def_dim(ncchkp->ncp, name, zdispls_all[varp->nchunk - 1] + zsize_all[varp->nchunk - 1], &zdimid);
+    if (err != NC_NOERR) return err;
+
+    // Define variable
+    sprintf(name, "_compressed_data_%d", varp->varid);
+    err = ncchkp->driver->def_var(ncchkp->ncp, name, NC_BYTE, 1, &zdimid, &(varp->datavarid));
+    if (err != NC_NOERR) return err;
+
+    // Record offset in data variable
+    err = ncchkp->driver->put_att(ncchkp->ncp, varp->varid, "_chunkoffset", NC_INT, varp->nchunk, zdispls_all, MPI_INT); // Original datatype
+    if (err != NC_NOERR) return err;
+
+    // Switch to data mode
+    err = ncchkp->driver->enddef(ncchkp->ncp);
+    if (err != NC_NOERR) return err;
+
+    //Now, we generate a varn call to write out compressed data
+    zstarts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nmychunks);
+    zcounts = (MPI_Offset**)NCI_Malloc(sizeof(MPI_Offset*) * nmychunks);
+    zstarts[0] = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * nmychunks);
+    zcounts[0] = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * nmychunks);
+    for(i = 0; i < nmychunks; i++){
+        zstarts[i] = zstarts[0] + i;
+        zcounts[i] = zcounts[0] + i;
+        zstarts[i][0] = zdispls_all[mychunks[i]];
+        zcounts[i][0] = zsize_all[mychunks[i]];
+    }
+    // zdispls[nmychunks] is the end offset of the last compressed chunk; unlike
+    // zdispls[nmychunks - 1] + zipsize[nmychunks - 1] it is also valid (0) when nmychunks is 0
+    err = ncchkp->driver->put_varn(ncchkp->ncp, varp->datavarid, nmychunks, zstarts, zcounts, zbuf, zdispls[nmychunks], MPI_UNSIGNED_CHAR, NC_REQ_WR | NC_REQ_BLK | NC_REQ_FLEX | NC_REQ_COLL);
+    if (err != NC_NOERR) return err;
+
+    // Record datavar id
+    err = ncchkp->driver->put_var(ncchkp->ncp, varp->varid, NULL, NULL, NULL, NULL, &(varp->datavarid), 1, MPI_INT, NC_REQ_WR | NC_REQ_BLK | NC_REQ_FLEX | NC_REQ_COLL);
+    if (err != NC_NOERR) return err;
+
+    // Free up buffers
+    NCI_Free(cstart);
+    NCI_Free(cend);
+    NCI_Free(ccord);
+    NCI_Free(tsize);
+    NCI_Free(tssize);
+    NCI_Free(tstart);
+    NCI_Free(mychunks);
+    NCI_Free(sendcounts);
+    NCI_Free(sdispls);
+    NCI_Free(recvcounts);
+    NCI_Free(rdispls);
+    NCI_Free(packoff);
+    NCI_Free(zipsize);
+    NCI_Free(zdispls);
+    NCI_Free(zsize_local);
+    NCI_Free(zsize_all);
+    NCI_Free(zdispls_all);
+    NCI_Free(zbuf);
+    NCI_Free(xbuf);
+    NCI_Free(sbuf);
+    NCI_Free(rbuf);
+    NCI_Free(start_all[0]);
+    NCI_Free(count_all[0]);
+    NCI_Free(stride_all[0]);
+    NCI_Free(start_all);
+    NCI_Free(count_all);
+    NCI_Free(stride_all);
+    NCI_Free(zstarts[0]);
+    NCI_Free(zcounts[0]);
+    NCI_Free(zstarts);
+    NCI_Free(zcounts);
+
+    return NC_NOERR;
+}
+
+/* NOTE(review): this function reads ncchkp, t0 and t9, none of which is
+ * declared here or passed in as a parameter. */
+void profile(){
+    /* Profiling information */
+    ncchkp->profile.total_data += t9 - t0;
+    ncchkp->profile.total_meta += t9 - t0;
+    ncchkp->profile.max_buffer += t9 - t0;
+    ncchkp->profile.total_time += t9 - t0;
+    ncchkp->profile.cb_time += t9 - t0;
+    ncchkp->profile.io_time += t9 - t0;
+
+    ncchkp->profile.cb_init_time += t9 - t0; // Calculate number of req
+    ncchkp->profile.cb_sync_time += t9 - t0; // Syncing number of req
+    ncchkp->profile.cb_pack_req_time += t9 - t0; // Pack request and reply
+    ncchkp->profile.cb_pack_rep_time += t9 - t0; // Pack request and reply
+    ncchkp->profile.cb_unpack_req_time += t9 - t0; // Unpack incoming request
+    ncchkp->profile.cb_unpack_rep_time += t9 - t0; // Unpack incoming request
+    ncchkp->profile.cb_send_req_time += t9 - t0; // Posting and waiting send
+    ncchkp->profile.cb_send_rep_time += t9 - t0; // Posting and waiting send
+    ncchkp->profile.cb_recv_req_time += t9 - t0; // Time posting and waiting recv
+    ncchkp->profile.cb_recv_rep_time += t9 - t0; // Time posting and waiting recv
+    ncchkp->profile.cb_self_time += t9 - t0; // Time handling our own data
+
+    ncchkp->profile.io_wr_time += t9 - t0;
+    ncchkp->profile.io_rd_time += t9 - t0;
+    ncchkp->profile.io_com_time += t9 - t0;
+    ncchkp->profile.io_decom_time += t9 - t0;
+    ncchkp->profile.io_sync_time += t9 - t0;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_lists.c b/src/drivers/ncchunkio/ncchkioi_lists.c
new file mode 100644
index 0000000000..d89b510aaf
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_lists.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2018, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include "ncchkio_internal.h"
+
+/* Reset a variable list to the empty state (no entries, no storage). */
+int ncchkioi_var_list_init(NC_chk_var_list *list) {
+    list->cnt = 0;
+    list->nalloc = 0;
+    list->data = NULL; /* no storage until the first ncchkioi_var_list_add() */
+    return NC_NOERR;
+}
+
+/* Release every variable held by the list, then the list storage itself.
+ * Nothing is done when no storage was ever allocated (nalloc == 0). */
+int ncchkioi_var_list_free(NC_chk_var_list *list) {
+    int i;
+    if (list->nalloc > 0){
+        for(i = 0; i < list->cnt; i++){
+            ncchkioi_var_free(list->data + i);
+        }
+        NCI_Free(list->data);
+    }
+    return NC_NOERR;
+}
+
+/* Reserve one more slot in the list and return its index.
+ * Storage starts at 16 entries and doubles whenever it is full.
+ * NOTE(review): allocation failures are handled by CHK_ALLOC, whose definition
+ * is not visible here. */
+int ncchkioi_var_list_add(NC_chk_var_list *list) {
+    if (list->nalloc == 0){
+        list->nalloc = 16;
+        list->data = NCI_Malloc(list->nalloc * sizeof(NC_chk_var));
+        CHK_ALLOC(list->data)
+    }
+    else if (list->nalloc == list->cnt){
+        list->nalloc *= 2;
+        list->data = NCI_Realloc(list->data, list->nalloc * sizeof(NC_chk_var));
+        CHK_ALLOC(list->data)
+    }
+
+    return ((list->cnt)++);
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_nonblocking.c b/src/drivers/ncchunkio/ncchkioi_nonblocking.c
new file mode 100644
index 0000000000..03b6dbbeb5
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_nonblocking.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2017, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/* NOTE(review): the header names of the #include directives below are missing
+ * in this copy of the patch; restore them from the repository before applying. */
+#ifdef HAVE_CONFIG_H
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define PUT_ARRAY_SIZE 128 /* Size of initial put list */
+#define SIZE_MULTIPLIER 2 /* When the request list is full, it is reallocated to SIZE_MULTIPLIER times its current size */
+
+/* getlist is a module in ADIOS driver that manages nonblocking get request objects
+ * It consists of a pool of request objects (reqs) and request ids (ids)
+ * It's implemented by 3 arrays of the same number of entries
+ * The id i corresponds to the i-th request object
+ * We issue a request object by issuing the corresponding request id
+ * ids is initialized with increasing id, i.e. ids[i] = i
+ * ids are issued from the beginning of the ids array
+ * We keep track of the location of each id in the ids array in the pos array. Initially, pos[i] = i
+ * A counter nused keeps track of issued ids, it also marks the position of the next unused id ready to be issued
+ * ids[0:nused] => active (used) request ids
+ * ids[nused:nalloc] => available (unused) request ids
+ * When issuing an id, we take the id at the position marked by nused and increase nused by 1
+ * When recycling an id, we swap it with the id right before the position marked by nused and decrease nused by 1 so that it falls into the unused pool
+ * NOTE: We do not guarantee to issue ids in continuous and increasing order
+ * NOTE: ids is simply a pool housing request ids, the position of an id within ids is not fixed and has no meaning
+ *
+ * Example:
+ * Initial:
+ *      ids = 0 1 2 3
+ *            ^
+ *            nused = 0
+ * After issuing 2 ids:
+ *            undefined|Available ids --->
+ *      ids = 0 1 2 3
+ *                ^
+ *                nused = 2
+ * Recycling id 0
+ *              |Available ids --->
+ *      ids = 1 0 2 3
+ *              ^
+ *              nused = 1
+ * Recycling id 1
+ *            |Available ids --->
+ *      ids = 1 0 2 3
+ *            ^
+ *            nused = 0
+ */
+
+/*
+ * Initialize the put list
+ * ids[0:nused] => active (used) request ids
+ * ids[nused:nalloc] => available (unused) request ids
+ * Returns NC_NOERR on success; an allocation failure is reported through
+ * CHK_PTR or as NC_ENOMEM.
+ */
+int ncchkioi_req_list_init(NC_chk_req_list *lp) {
+    int err=NC_NOERR;
+    int i;
+
+    /* Initialize parameter and allocate the array */
+    lp->nused = 0;
+    lp->nalloc = PUT_ARRAY_SIZE;
+    lp->reqs = (NC_chk_req*)NCI_Malloc(lp->nalloc * sizeof(NC_chk_req));
+    lp->ids = (int*)NCI_Malloc(lp->nalloc * SIZEOF_INT);
+    CHK_PTR(lp->ids)
+    lp->pos = (int*)NCI_Malloc(lp->nalloc * SIZEOF_INT);
+    CHK_PTR(lp->pos)
+    if (lp->reqs == NULL || lp->ids == NULL) {
+        DEBUG_RETURN_ERROR(NC_ENOMEM);
+    }
+
+    /* Initialize values of ids and reqs
+     * Assign increasing unique id
+     */
+    for (i = 0; i < lp->nalloc; i++) {
+        lp->ids[i] = i; // Unique ids
+        lp->pos[i] = i; // Not in use
+    }
+
+err_out:;
+    return err;
+}
+
+/*
+ * Enlarge the put list
+ * When there are no more unused ids to issue, we must add more ids to the pool
+ * We simply enlarge ids and reqs array
+ * We initialize the extended part as usual
+ * Returns NC_NOERR on success or NC_ENOMEM when a reallocation fails.
+ */
+static int ncchkioi_req_list_resize(NC_chk_req_list *lp)
+{
+    int i;
+    size_t nsize;
+    void *ptr;
+
+    /* Calculate new size */
+    nsize = lp->nalloc * SIZE_MULTIPLIER;
+
+    /* Realloc reqs and ids */
+    ptr = NCI_Realloc(lp->reqs, nsize * sizeof(NC_chk_req));
+    if (ptr == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM);
+    lp->reqs = (NC_chk_req*)ptr;
+
+    ptr = NCI_Realloc(lp->ids, nsize * SIZEOF_INT);
+    if (ptr == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM);
+    lp->ids = (int*)ptr;
+
+    ptr = NCI_Realloc(lp->pos, nsize * SIZEOF_INT);
+    if (ptr == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM);
+    lp->pos = (int*)ptr;
+
+    /* Initialize values of ids and reqs
+     * Assign increasing unique id
+     * Only the newly added tail [nalloc, nsize) is touched; existing entries keep their state
+     */
+    for (i = lp->nalloc; (size_t)i < nsize; i++) {
+        lp->ids[i] = i; // Unique ids
+        lp->pos[i] = i; // Default position
+    }
+
+    lp->nalloc = nsize;
+
+    return NC_NOERR;
+}
+
+/*
+ * Clean up the put list
+ * Frees the reqs, ids and pos arrays; always returns NC_NOERR
+ */
+int ncchkioi_req_list_free(NC_chk_req_list *lp)
+{
+    NCI_Free(lp->reqs);
+    NCI_Free(lp->ids);
+    NCI_Free(lp->pos);
+
+    return NC_NOERR;
+}
+
+/*
+ * Allocate a new request object from the getlist with id
+ * We first check if there are unused ids
+ * We increase the size 
of pool, bringing in new ids if there aren't
+ * Then we issue the ids at position nused and increase it by 1
+ */
+int ncchkioi_req_list_add(NC_chk_req_list *lp, int *id)
+{
+    /* Every id is in use: grow the pool before handing one out */
+    if (lp->nused == lp->nalloc) {
+        int status = ncchkioi_req_list_resize(lp);
+        if (status != NC_NOERR) return status;
+    }
+
+    /* ids[nused] is the first id that is still available */
+    *id = lp->ids[lp->nused];
+    lp->nused++;
+
+    return NC_NOERR;
+}
+
+/*
+ * Return request reqid to the pool of available ids
+ * The buffers attached to the request are released first; the id is then
+ * swapped with the last active id so that the layout stays
+ * ids[0:nused] => active (used) request ids
+ * ids[nused:nalloc] => available (unused) request ids
+ * and pos[] keeps pointing at the slot of every id
+ */
+int ncchkioi_req_list_remove(NC_chk_req_list *lp, int reqid) {
+    NC_chk_req *victim = &(lp->reqs[reqid]);
+    int slot;  /* current slot of reqid inside ids[] */
+    int last;  /* slot of the last active id */
+    int moved; /* id that currently lives in the last active slot */
+
+    /* Release the buffers attached to the request */
+    if (victim->start) {
+        NCI_Free(victim->start);
+    }
+    if (victim->count) {
+        NCI_Free(victim->count);
+    }
+    if (victim->starts) {
+        NCI_Free(victim->starts);
+    }
+    if (victim->counts) {
+        NCI_Free(victim->counts);
+    }
+    if (victim->stride) {
+        NCI_Free(victim->stride);
+    }
+    if (victim->xbufs) {
+        NCI_Free(victim->xbufs);
+    }
+    /* xbuf is freed only when it is not the same pointer as buf */
+    if (victim->xbuf != victim->buf) {
+        NCI_Free(victim->xbuf);
+    }
+
+    /* Swap reqid with the last active id, then shrink the active range by one */
+    slot = lp->pos[reqid];
+    last = lp->nused - 1;
+    moved = lp->ids[last];
+
+    lp->ids[slot] = moved;
+    lp->pos[moved] = slot;
+    lp->ids[last] = reqid;
+    lp->pos[reqid] = last;
+    lp->nused = last;
+
+    return NC_NOERR;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_profile.m4 b/src/drivers/ncchunkio/ncchkioi_profile.m4
new file mode 100644
index 0000000000..93f7a8157d
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_profile.m4
@@ -0,0 +1,98 @@
+dnl Process this m4 file to produce 'C' language file.
+dnl
+dnl If you see this line, you can ignore the next one.
+/* Do not edit this file. It is produced from the corresponding .m4 source */
+dnl
+/*
+ * Copyright (C) 2021, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
/*
 * Report performance profiling
 */
#ifdef PNETCDF_PROFILING

/* Scratch buffers shared by the reductions below */
static double tmax[NC_CHK_NTIMER], tmin[NC_CHK_NTIMER], tmean[NC_CHK_NTIMER], tvar[NC_CHK_NTIMER], tvar_local[NC_CHK_NTIMER];

/* Printable name of every timer, in timer id order */
const char * const tname[NC_CHK_NTIMER]={
foreach(`t', NC_CHK_TIMERS, `"CONCATE(`nc_chk_timer_', t)",
')dnl
};

/*
 * Add t to the accumulated time of timer id and count one more event.
 * id must be a valid timer id, 0 <= id < NC_CHK_NTIMER.
 */
void ncchkioi_profile_add_time (NC_chk *ncchkp, int id, double t) {
    assert (id >= 0 && id < NC_CHK_NTIMER);
    ncchkp->profile.tt[id] += t;
    ncchkp->profile.cnt[id]++;
}

/*
 * Reduce one per-timer array across all processes and print, on rank 0,
 * the mean, max, min and variance of every entry.
 *
 * vals  : local array of NC_CHK_NTIMER values
 * label : word placed between the timer name and the statistic name
 *
 * Collective over ncchkp->comm. The variance is the population variance,
 * i.e. the sum of squared deviations divided by the number of processes.
 */
static void ncchkioi_print_profile_stat (NC_chk *ncchkp, double *vals, const char *label) {
    int i;

    MPI_Reduce (vals, tmax, NC_CHK_NTIMER, MPI_DOUBLE, MPI_MAX, 0, ncchkp->comm);
    MPI_Reduce (vals, tmin, NC_CHK_NTIMER, MPI_DOUBLE, MPI_MIN, 0, ncchkp->comm);
    /* Every rank needs the mean to compute its own squared deviation */
    MPI_Allreduce (vals, tmean, NC_CHK_NTIMER, MPI_DOUBLE, MPI_SUM, ncchkp->comm);
    for (i = 0; i < NC_CHK_NTIMER; i++) {
        tmean[i] /= ncchkp->np;
        tvar_local[i] = (vals[i] - tmean[i]) * (vals[i] - tmean[i]);
    }
    MPI_Reduce (tvar_local, tvar, NC_CHK_NTIMER, MPI_DOUBLE, MPI_SUM, 0, ncchkp->comm);

    /* tmax, tmin and tvar are only defined on the root of the reductions */
    if (ncchkp->rank == 0) {
        for (i = 0; i < NC_CHK_NTIMER; i++) {
            printf ("#%%$: %s_%s_mean: %lf\n", tname[i], label, tmean[i]);
            printf ("#%%$: %s_%s_max: %lf\n", tname[i], label, tmax[i]);
            printf ("#%%$: %s_%s_min: %lf\n", tname[i], label, tmin[i]);
            printf ("#%%$: %s_%s_var: %lf\n\n", tname[i], label, tvar[i] / ncchkp->np);
        }
    }
}

/*
 * Print the statistics of the accumulated time and of the event count of
 * every timer. Collective over ncchkp->comm; output is written by rank 0.
 */
void ncchkioi_print_profile(NC_chk *ncchkp){
    ncchkioi_print_profile_stat (ncchkp, ncchkp->profile.tt, "time");
    ncchkioi_print_profile_stat (ncchkp, ncchkp->profile.cnt, "count");
}
#endif
/*
 * Report performance profiling
 */
#ifdef PNETCDF_PROFILING

/* Number of timers; computed by m4 from the timer name list */
#define NC_CHK_NTIMER list_len(NC_CHK_TIMERS)

dnl One C define per timer name; the value is the position of the name in the list
foreach_idx(`t', `i', NC_CHK_TIMERS, `#define CONCATE(`NC_CHK_TIMER_', upcase(t)) i
')dnl

/*
 * All timer macros below expect a variable named ncchkp to be in scope at
 * the call site. They expand to complete statements, so they are written
 * without a trailing semicolon and must not be used as the unbraced body
 * of an if or a loop.
 */

/* Record the current time as the start time of timer A */
#define NC_CHK_TIMER_START(A) ncchkp->profile.st[A] = MPI_Wtime();
/* Add the time elapsed since the last start of timer A; the event count is left unchanged */
#define NC_CHK_TIMER_PAUSE(A) {                                         \
    ncchkp->profile.tt[A] += MPI_Wtime() - ncchkp->profile.st[A];       \
}
/* Same as PAUSE, and count one more event for timer A */
#define NC_CHK_TIMER_STOP(A) {                                          \
    NC_CHK_TIMER_PAUSE(A)                                               \
    ncchkp->profile.cnt[A] ++;                                          \
}
/* Stop timer A and start timer B using a single clock reading */
#define NC_CHK_TIMER_SWAP(A, B) {                                       \
    double tmp = MPI_Wtime();                                           \
    ncchkp->profile.tt[A] += tmp - ncchkp->profile.st[A];               \
    ncchkp->profile.cnt[A] ++;                                          \
    ncchkp->profile.st[B] = tmp;                                        \
}
/* Stop timer A and subtract the same interval from the total of timer B,
 * so the time measured by A is excluded from B */
#define NC_CHK_TIMER_STOPEX(A, B) {                                     \
    double tmp = MPI_Wtime();                                           \
    ncchkp->profile.tt[A] += tmp - ncchkp->profile.st[A];               \
    ncchkp->profile.cnt[A] ++;                                          \
    ncchkp->profile.tt[B] -= tmp - ncchkp->profile.st[A];               \
}

dnl
typedef struct NC_chk_timers {
    /* Profiling information */
    double st[NC_CHK_NTIMER];   /* time of the last start of each timer */
    double tt[NC_CHK_NTIMER];   /* accumulated time of each timer */
    double cnt[NC_CHK_NTIMER];  /* number of events of each timer; kept as double */
} NC_chk_timers;

#else

/* Profiling disabled: the timer macros expand to nothing */
#define NC_CHK_TIMER_START(A)
#define NC_CHK_TIMER_STOP(A)
#define NC_CHK_TIMER_PAUSE(A)
#define NC_CHK_TIMER_SWAP(A, B)
#define NC_CHK_TIMER_STOPEX(A, B)

#endif
+ `put', dnl + `put_cb', dnl + `put_cb_init', dnl + `put_cb_sync', dnl + `put_cb_pack_req', dnl + `put_cb_pack_rep', dnl + `put_cb_unpack_req', dnl + `put_cb_unpack_rep', dnl + `put_cb_send_req', dnl + `put_cb_send_rep', dnl + `put_cb_recv_req', dnl + `put_cb_recv_rep', dnl + `put_cb_self', dnl + `put_cb_barr', dnl + `put_bg', dnl + `put_bg_init', dnl + `put_bg_cache', dnl + `put_bg_rd', dnl + `put_bg_decom', dnl + `put_io', dnl + `put_io_init', dnl + `put_io_com', dnl + `put_io_sync', dnl + `put_io_wr', dnl + `put_io_barr', dnl + `get', dnl + `get_resize', dnl + `get_cb', dnl + `get_cb_init', dnl + `get_cb_sync', dnl + `get_cb_pack_req', dnl + `get_cb_pack_rep', dnl + `get_cb_unpack_req', dnl + `get_cb_unpack_rep', dnl + `get_cb_send_req', dnl + `get_cb_send_rep', dnl + `get_cb_recv_req', dnl + `get_cb_recv_rep', dnl + `get_cb_self', dnl + `get_cb_barr', dnl + `get_io', dnl + `get_io_init', dnl + `get_io_cache', dnl + `get_io_rd', dnl + `get_io_decom', dnl + `get_convert', dnl + `finalize', dnl + `finalize_meta', dnl + `iput', dnl + `iget', dnl + `wait', dnl + `wait_put', dnl + `wait_put_barr', dnl + `wait_get', dnl + `put_size', dnl + `get_size', dnl + `send_size', dnl + `recv_size', dnl + `nsend', dnl + `nrecv', dnl + `nremote', dnl + `nreq', dnl + `nlocal', dnl + `nchunk', dnl + `var_size', dnl + `var_zsize', dnl +)')`'dnl \ No newline at end of file diff --git a/src/drivers/ncchunkio/ncchkioi_put_var.c b/src/drivers/ncchunkio/ncchkioi_put_var.c new file mode 100644 index 0000000000..528736bb3b --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_put_var.c @@ -0,0 +1,766 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ncchkio_internal.h" + +int ncchkioi_put_var_cb_chunk (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) { + int err=NC_NOERR; + int i, j; + int cid; // Chunk iterator + + MPI_Offset *ostart, *osize; + int *tsize, *tssize, *tstart, *tsizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; + + int *wcnt_local, *wcnt_all; // Number of processes that writes to each chunk + + int nread; // Chunks to read for background + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + int max_tbuf = 0; // Size of intermediate buffer + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreqs, *rreqs; // Send and recv req + MPI_Status *sstats, *rstats; // Send and recv status + char **sbufs, **rbufs; // Send and recv buffer + int *rsizes; // recv size of each message + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Allocate buffering for write count + wcnt_local = (int *)NCI_Malloc (sizeof (int) * varp->nchunk * 2); + wcnt_all = wcnt_local + varp->nchunk; + + // Allocate buffering for overlaping index + tstart = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + tsize = tstart + varp->ndim; + tssize = tsize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of 
message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (wcnt_local, 0, sizeof (int) * varp->nchunk); + nsend = 0; + + // Iterate through chunks + ncchkioi_chunk_itr_init (varp, start, count, citr, &cid); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] != ncchkp->rank) { + // Count number of mnessage we need to send + nsend++; + wcnt_local[cid] = 1; + } else { + // We mark covered chunk of our own to prevent unnecessary calculation of overlap + // -1 is purely a mark, we need to add 1 back to global message count + wcnt_local[cid] = -1; + max_tbuf = varp->chunksize; + } + } while (ncchkioi_chunk_itr_next (varp, start, count, citr, &cid)); + + // Allocate buffer for sending + sbufs = (char **)NCI_Malloc (sizeof (char *) * nsend); + sreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend); + sstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC) + + // Sync number of messages of each chunk + CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, varp->nchunk, MPI_INT, MPI_SUM, ncchkp->comm); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ) + + // Calculate number of recv request + // This is for all the chunks + nrecv = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + // We don't need message for our own data + nrecv += wcnt_all[cid] - wcnt_local[cid]; + } + rreqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv); + rstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nrecv); + rbufs = (char **)NCI_Malloc (sizeof (char *) * nrecv); + rsizes = (int *)NCI_Malloc (sizeof (int) * nrecv); + + // Post send + nsend = 0; + // Iterate through chunks + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, 
+ osize); // Initialize chunk iterator + do { + // We got something to send if we are not owner + if (varp->chunk_owner[cid] != ncchkp->rank) { + // Calculate chunk overlap + overlapsize = varp->esize; + for (j = 0; j < varp->ndim; j++) { overlapsize *= osize[j]; } + + // Allocate buffer + sbufs[nsend] = (char *)NCI_Malloc (overlapsize + sizeof (int) * varp->ndim * 2); + + // Metadata + packoff = 0; + tstartp = (int *)sbufs[nsend]; + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(sbufs[nsend] + packoff); + packoff += varp->ndim * sizeof (int); + for (j = 0; j < varp->ndim; j++) { + tstartp[j] = (int)(ostart[j] - citr[j]); + tsizep[j] = (int)osize[j]; + } + + // Pack type + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data + CHK_ERR_PACK (buf, 1, ptype, sbufs[nsend], packoff + overlapsize, &packoff, MPI_COMM_SELF); + + MPI_Type_free (&ptype); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Send the request + CHK_ERR_ISEND (sbufs[nsend], packoff, MPI_BYTE, varp->chunk_owner[cid], cid, + ncchkp->comm, sreqs + nsend); + + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_PUT_CB_SEND_REQ, NC_CHK_TIMER_PUT_CB_PACK_REQ) + nsend++; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Post recv + nrecv = 0; + for (j = 0; j < varp->nmychunk; j++) { + cid = varp->mychunks[j]; + // We are the owner of the chunk + // Receive data from other process + for (i = 0; i < wcnt_all[cid] - wcnt_local[cid]; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, cid, ncchkp->comm, &rmsg, rstats); + CHK_ERR_GET_COUNT (rstats, MPI_BYTE, rsizes + nrecv); + + // Allocate 
buffer + rbufs[nrecv] = (char *)NCI_Malloc (rsizes[nrecv]); + + // Post irecv + CHK_ERR_IMRECV (rbufs[nrecv], rsizes[nrecv], MPI_BYTE, &rmsg, rreqs + nrecv); + + nrecv++; + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Wait for all send + CHK_ERR_WAITALL (nsend, sreqs, sstats); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Preparing chunk cache + nread = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + if (wcnt_all[cid] && varp->chunk_cache[cid] == NULL) { + if (varp->chunk_index[cid].len > 0) { nread++; } + } + } + rids = (int *)NCI_Malloc (sizeof (int) * nread); + nread = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + if (wcnt_all[cid] || wcnt_local[cid]) { + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (NC_chk_cache*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + + // Read background + ncchkioi_load_var_bg (ncchkp, varp, nread, rids); + + // Allocate intermediate buffer + if (max_tbuf > 0) { tbuf = (char *)NCI_Malloc (max_tbuf); } + + // For each chunk we own, we need to receive incoming data + nrecv = 0; + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle our own data first if we have any + if (wcnt_local[cid] < 0) { + // Convert chunk id to iterator + get_chunk_itr (varp, cid, citr); + + // Calculate overlapping region + overlapsize = get_chunk_overlap (varp, citr, start, count, ostart, osize); 
+ + if (overlapsize > 0) { + // Pack type from user buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (buf, 1, ptype, tbuf, varp->chunksize, &packoff, MPI_COMM_SELF); + overlapsize = packoff; + + MPI_Type_free (&ptype); + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, ptype, MPI_COMM_SELF); + + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Now, it is time to process data from other processes + + // Wait for all send requests related to this chunk + // We remove the impact of -1 mark in wcnt_local[cid] + CHK_ERR_WAITALL (wcnt_all[cid] - wcnt_local[cid], rreqs + nrecv, rstats + nrecv); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + + // Process data received + for (j = nrecv; j < nrecv + wcnt_all[cid] - wcnt_local[cid]; j++) { + // Metadata + packoff = 0; + tstartp = (int *)rbufs[j]; + packoff += varp->ndim * sizeof (int); + tsizep = (int *)(rbufs[j] + packoff); + packoff += varp->ndim * sizeof (int); + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, varp->chunkdim, tsizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + 
CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + CHK_ERR_UNPACK (rbufs[j], rsizes[j], &packoff, varp->chunk_cache[cid]->buf, 1, ptype, MPI_COMM_SELF); + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + nrecv += wcnt_all[cid] - wcnt_local[cid]; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + } + + // Free buffers + NCI_Free (wcnt_local); + + NCI_Free (tstart); + + NCI_Free (ostart); + + NCI_Free (sreqs); + NCI_Free (sstats); + for (i = 0; i < nsend; i++) { NCI_Free (sbufs[i]); } + NCI_Free (sbufs); + + NCI_Free (rreqs); + NCI_Free (rstats); + for (i = 0; i < nrecv; i++) { NCI_Free (rbufs[i]); } + NCI_Free (rbufs); + NCI_Free (rsizes); + + if (tbuf != NULL) { NCI_Free (tbuf); } + + if (rids != NULL) { NCI_Free (rids); } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + +err_out:; + return err; +} + +int ncchkioi_put_var_cb_proc (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) { + int err=NC_NOERR; + int i, j, k; + int cid, cown; // Chunk iterator + + MPI_Offset *ostart = NULL, *osize; + int *tsize, *tssize, *tstart = NULL, *tssizep, *tstartp; // Size for sub-array type + MPI_Offset *citr; // Bounding box for chunks overlapping my own write region + + int *wcnt_local = NULL, *wcnt_all; // Number of processes that writes to each chunk + int wrange_local[2], wrange_all[2]; // Number of processes that writes to each chunk + + int nread; // Chunks to read for background + int *rids; + + int overlapsize; // Size of overlaping region of request and chunk + char *tbuf = NULL; // Intermediate buffer + + int packoff; // Pack offset + MPI_Datatype ptype; // Pack datatype + + int nsend, nrecv; // Number of send and receive + MPI_Request *sreq = NULL, *rreq = NULL; // Send and recv req + MPI_Status *sstat = NULL, rstat; // Send and recv status + char **sbuf = NULL, **sbufp, **rbuf = NULL, **rbufp; // Send and recv buffer + int *rsize = NULL, *ssize = NULL; // recv 
size of each message + int *sdst; // recv size of each message + int *smap; + size_t bsize; + MPI_Message rmsg; // Receive message + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Allocate buffering for write count + wcnt_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->np * 3); + CHK_PTR (wcnt_local) + wcnt_all = wcnt_local + ncchkp->np; + smap = wcnt_all + ncchkp->np; + + // Allocate buffering for overlaping index + tstart = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3); + CHK_PTR (tstart) + tssize = tstart + varp->ndim; + tsize = tssize + varp->ndim; + ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3); + CHK_PTR (ostart) + osize = ostart + varp->ndim; + + // Chunk iterator + citr = osize + varp->ndim; + + // We need to calculate the size of message of each chunk + // This is just for allocating send buffer + // We do so by iterating through all request and all chunks they cover + // If we are not the owner of a chunk, we need to send message + memset (wcnt_local, 0, sizeof (int) * ncchkp->np); + nsend = 0; + + // Count total number of messages and build a map of accessed chunk to list of comm + // datastructure + wrange_local[0] = varp->nchunk; + wrange_local[1] = 0; + ncchkioi_chunk_itr_init (varp, start, count, citr, &cid); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + + // Mapping to skip list of send requests + if (wcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; } + wcnt_local[cown] = 1; // Need to send message if not owner + + // Record lowest and highest chunk accessed + if (wrange_local[0] > cid) { wrange_local[0] = cid; } + if (wrange_local[1] < cid) { wrange_local[1] = cid; } + } while (ncchkioi_chunk_itr_next (varp, start, count, citr, &cid)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC) + + // Sync number of messages of each chunk and access range + 
CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm); + wrange_local[1] *= -1; + CHK_ERR_ALLREDUCE (wrange_local, wrange_all, 2, MPI_INT, MPI_MIN, ncchkp->comm); + nrecv = wcnt_all[ncchkp->rank] - + wcnt_local[ncchkp->rank]; // We don't need to receive request from self + wrange_all[1] *= -1; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ) + + // Allocate data structure for messaging + sbuf = (char **)NCI_Malloc (sizeof (char *) * nsend * 2); + CHK_PTR (sbuf) + sbufp = sbuf + nsend; + ssize = (int *)NCI_Malloc (sizeof (int) * nsend * 2); + CHK_PTR (ssize) + sdst = ssize + nsend; + sreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend); + CHK_PTR (sreq) + sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend); + CHK_PTR (sstat) + + rbuf = (char **)NCI_Malloc (sizeof (char *) * nrecv * 2); + CHK_PTR (rbuf) + rbufp = rbuf + nrecv; + rsize = (int *)NCI_Malloc (sizeof (int) * nrecv); + CHK_PTR (rsize) + rreq = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv); + CHK_PTR (rreq) + + // Count size of each request + memset (ssize, 0, sizeof (int) * nsend); + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + sdst[j] = cown; // Record a reverse map by the way + + // Count overlap + overlapsize = varp->esize; + for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; } + ssize[j] += overlapsize + sizeof (int) * (varp->ndim * 2 + 1); + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + // Allocate buffer for send + bsize = 0; + for (i = 0; i < nsend; i++) { bsize += ssize[i]; } + if (nsend > 0) { + sbuf[0] = sbufp[0] = (char *)NCI_Malloc (bsize); + CHK_PTR (sbuf[0]) + for (i = 1; i < nsend; i++) + sbuf[i] = sbufp[i] = sbuf[i-1] + ssize[i-1]; + } + + // Pack 
requests + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + // Chunk owner + cown = varp->chunk_owner[cid]; + if (cown != ncchkp->rank) { + j = smap[cown]; + + // Metadata + *((int *)sbufp[j]) = cid; + sbufp[j] += sizeof (int); + tstartp = (int *)sbufp[j]; + sbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)sbufp[j]; + sbufp[j] += varp->ndim * sizeof (int); + for (i = 0; i < varp->ndim; i++) { + tstartp[i] = (int)(ostart[i] - citr[i]); + tssizep[i] = (int)osize[i]; + } + + // Pack type + for (i = 0; i < varp->ndim; i++) { + tstart[i] = (int)(ostart[i] - start[i]); + tsize[i] = (int)count[i]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + packoff = 0; + int outsize = ssize[j] - sizeof(int) * (varp->ndim * 2 + 1); + CHK_ERR_PACK (buf, 1, ptype, sbufp[j], outsize, &packoff, MPI_COMM_SELF); + sbufp[j] += packoff; + MPI_Type_free (&ptype); + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + // Post send + for (i = 0; i < nsend; i++) { + CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Post recv + for (i = 0; i < nrecv; i++) { + // Get message size, including metadata + CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat); + CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i); + + // Allocate buffer + rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]); + CHK_PTR (rbuf[i]) + + // Post irecv + CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i); + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT) + + // Preparing chunk cache + for (j 
= 0; j < varp->nmychunk && varp->mychunks[j] < wrange_all[0]; j++) + ; + for (k = j; k < varp->nmychunk && varp->mychunks[k] <= wrange_all[1]; k++) + ; + rids = (int *)NCI_Malloc (sizeof (int) * (k - j)); + nread = 0; + for (i = j; i < k; i++) { + cid = varp->mychunks[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + // Increase batch number to indicate allocated chunk buffer can be freed for future allocation + (ncchkp->cache_serial)++; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT) + + // Read background + err = ncchkioi_load_var_bg (ncchkp, varp, nread, rids); + CHK_ERR + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF) + + tbuf = (char *)NCI_Malloc (varp->chunksize); + CHK_PTR (tbuf) + + // Handle our own data + ncchkioi_chunk_itr_init_ex (varp, start, count, citr, &cid, ostart, + osize); // Initialize chunk iterator + do { + if (varp->chunk_owner[cid] == ncchkp->rank) { + // Pack type from user buffer to (contiguous) intermediate buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - start[j]); + tsize[j] = (int)count[j]; + tssize[j] = (int)osize[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Pack data into intermediate buffer + packoff = 0; + CHK_ERR_PACK (buf, 1, ptype, tbuf, varp->chunksize, &packoff, MPI_COMM_SELF); + MPI_Type_free (&ptype); + overlapsize = packoff; + + // Pack type from (contiguous) intermediate buffer to chunk buffer + for (j = 0; j < varp->ndim; j++) { + tstart[j] = (int)(ostart[j] - citr[j]); + tsize[j] = varp->chunkdim[j]; + } + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C, + varp->etype, 
&ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Unpack data into chunk buffer + packoff = 0; + CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, ptype, MPI_COMM_SELF); + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + } while (ncchkioi_chunk_itr_next_ex (varp, start, count, citr, &cid, ostart, osize)); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF) + + // Handle incoming requests + for (i = 0; i < varp->ndim; i++) { tsize[i] = varp->chunkdim[i]; } + for (i = 0; i < nrecv; i++) { + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ) + + // Will wait any provide any benefit? + MPI_Waitany (nrecv, rreq, &j, &rstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + + while (rbufp[j] < rbuf[j] + rsize[j]) { + // Metadata + cid = *(int *)(rbufp[j]); + rbufp[j] += sizeof (int); + tstartp = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + tssizep = (int *)rbufp[j]; + rbufp[j] += varp->ndim * sizeof (int); + + // Pack type + CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C, + varp->etype, &ptype); + CHK_ERR_TYPE_COMMIT (&ptype); + + // Data + packoff = 0; + CHK_ERR_UNPACK (rbufp[j], rsize[j], &packoff, varp->chunk_cache[cid]->buf, 1, ptype, MPI_COMM_SELF); + rbufp[j] += packoff; + MPI_Type_free (&ptype); + + // Mark chunk as dirty + varp->dirty[cid] = 1; + } + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ) + + CHK_ERR_WAITALL (nsend, sreq, sstat); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ) + +err_out:; + + // Free buffers + NCI_Free (wcnt_local); + + NCI_Free (tstart); + + NCI_Free (ostart); + + NCI_Free (sreq); + NCI_Free (sstat); + NCI_Free (ssize); + if (nsend > 0) { NCI_Free (sbuf[0]); } + NCI_Free (sbuf); + + NCI_Free (rreq); + for (i = 0; i < nrecv; i++) { NCI_Free (rbuf[i]); } + NCI_Free (rbuf); + NCI_Free (rsize); + + NCI_Free (tbuf); + 
+ NCI_Free (rids); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB) + + return err; +} + +int ncchkioi_put_var (NC_chk *ncchkp, + NC_chk_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + void *buf) { + int err=NC_NOERR; + + if (varp->isrec) { + if (ncchkp->recsize < start[0] + count[0]) { ncchkp->recsize = start[0] + count[0]; } + CHK_ERR_ALLREDUCE (MPI_IN_PLACE, &(ncchkp->recsize), 1, MPI_LONG_LONG, MPI_MAX, + ncchkp->comm); // Sync number of recs + if (varp->dimsize[0] < ncchkp->recsize) { + NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT) + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE) + + err = ncchkioi_var_resize (ncchkp, varp); + CHK_ERR + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT) + } + } + + // Collective buffer + switch (ncchkp->comm_unit) { + case NC_CHK_COMM_CHUNK: + err = ncchkioi_put_var_cb_chunk (ncchkp, varp, start, count, stride, buf); + break; + case NC_CHK_COMM_PROC: + err = ncchkioi_put_var_cb_proc (ncchkp, varp, start, count, stride, buf); + break; + } + CHK_ERR + + // Write the compressed variable + err = ncchkioi_save_var (ncchkp, varp); + CHK_ERR + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_put_varn.c b/src/drivers/ncchunkio/ncchkioi_put_varn.c new file mode 100644 index 0000000000..94bb60887f --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_put_varn.c @@ -0,0 +1,810 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ *
+ * ncmpi_get_var_all()        : dispatcher->get_var()
+ * ncmpi_put_var_all()        : dispatcher->put_var()
+ * ncmpi_get_var_<type>_all() : dispatcher->get_var()
+ * ncmpi_put_var_<type>_all() : dispatcher->put_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/* NOTE(review): the header names on the eight #include lines below were lost when this
+ * patch was extracted (only bare "#include" survived).  They are reconstructed from what
+ * the code uses -- confirm against the upstream file before applying. */
+#include <common.h>
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <pnetcdf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "ncchkio_internal.h"
+
+/*
+ * Collective-buffering step of put_varn that exchanges ONE MESSAGE PER CHUNK.
+ *
+ * For every chunk touched by the nreq requests:
+ *   - if this rank is not the chunk owner, the overlapping data of all requests is packed
+ *     (preceded by per-request start/size metadata) and sent to the owner with the chunk
+ *     id as the MPI tag;
+ *   - if this rank is the owner, its own data is copied into the chunk cache and every
+ *     incoming message for the chunk is unpacked into the chunk cache.
+ * Every chunk that receives data is marked dirty.
+ *
+ * ncchkp  : file object (rank, communicator, cache bookkeeping)
+ * varp    : variable being written
+ * nreq    : number of requests
+ * starts  : starts[req][dim], start coordinate of each request
+ * counts  : counts[req][dim], extent of each request
+ * strides : not used by this routine
+ * bufs    : bufs[req], user data of each request
+ *
+ * Returns NC_NOERR, or the error set by one of the CHK_ERR* macros.
+ * NOTE(review): the CHK_ERR* macros appear to jump to err_out, which is placed after the
+ * free section, so the work buffers are not released on an error path.
+ */
+int ncchkioi_put_varn_cb_chunk (NC_chk *ncchkp,
+                                NC_chk_var *varp,
+                                int nreq,
+                                MPI_Offset *const *starts,
+                                MPI_Offset *const *counts,
+                                MPI_Offset *const *strides,
+                                void **bufs) {
+    int err=NC_NOERR;
+    int i, j;
+    int cid, req;  // Chunk and request iterator
+
+    int *tsize, *tssize, *tstart, *tsizep, *tstartp;  // Size for sub-array type
+    MPI_Offset *ostart, *osize;
+    MPI_Offset *citr;
+
+    int *wcnt_local, *wcnt_all;  // Number of processes that writes to each chunk
+
+    int nread;  // Chunks to read for background
+    int *rids;
+
+    int overlapsize;    // Size of overlapping region of request and chunk
+    int max_tbuf;       // Size of intermediate buffer
+    char *tbuf = NULL;  // Intermediate buffer
+
+    int packoff;         // Pack offset
+    MPI_Datatype ptype;  // Pack datatype
+
+    int nsend, nrecv;             // Number of send and receive
+    MPI_Request *sreqs, *rreqs;   // Send and recv req
+    MPI_Status *sstats, *rstats;  // Send and recv status
+    char **sbufs, **rbufs;        // Send and recv buffer
+    int *rsizes;                  // recv size of each message
+    MPI_Message rmsg;             // Receive message
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT)
+
+    // Allocate buffering for write count
+    wcnt_local = (int *)NCI_Malloc (sizeof (int) * varp->nchunk * 2);
+    wcnt_all   = wcnt_local + varp->nchunk;
+
+    // Allocate buffering for overlapping index
+    tstart = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3);
+    tsize  = tstart + varp->ndim;
+    tssize = tsize + varp->ndim;
+    ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3);
+    osize  = ostart + varp->ndim;
+
+    // Chunk iterator
+    citr = osize + varp->ndim;
+
+    // We need to calculate the size of message of each chunk
+    // This is just for allocating send buffer
+    // We do so by iterating through all request and all chunks they cover
+    // If we are not the owner of a chunk, we need to send message
+    memset (wcnt_local, 0, sizeof (int) * varp->nchunk);
+    nsend    = 0;
+    max_tbuf = 0;
+    for (req = 0; req < nreq; req++) {
+        // Initialize chunk iterator
+        ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                    osize);  // Initialize chunk iterator
+
+        // Iterate through chunks
+        do {
+            // Calculate overlapping
+            overlapsize = varp->esize;
+            for (j = 0; j < varp->ndim; j++) { overlapsize *= osize[j]; }
+
+            if (varp->chunk_owner[cid] != ncchkp->rank) {
+                // Count number of messages we need to send
+                if (wcnt_local[cid] == 0) { nsend++; }
+                wcnt_local[cid] += overlapsize + sizeof (int) * 2 * varp->ndim;
+            } else {
+                // We mark covered chunk of our own to prevent unnecessary calculation of overlap
+                // -1 is purely a mark, we need to add 1 back to global message count
+                wcnt_local[cid] = -1;
+
+                // Record max overlapsize so we know how large the intermediate buffer is needed
+                // later
+                if (max_tbuf < overlapsize) { max_tbuf = overlapsize; }
+            }
+
+        } while (
+            ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize));
+    }
+
+    // Allocate buffer for sending
+    sbufs  = (char **)NCI_Malloc (sizeof (char *) * nsend);
+    sreqs  = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend);
+    sstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend);
+    j      = 0;
+    // Allocate buffer for data
+    for (cid = 0; cid < varp->nchunk; cid++) {
+        // Count number of messages we need to send
+        if (wcnt_local[cid] > 0) {
+            // Add space for number of reqs
+            sbufs[j++] = (char *)NCI_Malloc (wcnt_local[cid]);
+            // We don't need message size anymore, wcnt_local is used to track number of message
+            // from now on
+            wcnt_local[cid] = 1;
+        }
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC)
+
+    // Sync number of messages of each chunk
+    CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, varp->nchunk, MPI_INT, MPI_SUM, ncchkp->comm);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC)
+
+    // Calculate number of recv request
+    // This is for all the chunks
+    nrecv = 0;
+    for (i = 0; i < varp->nmychunk; i++) {
+        cid = varp->mychunks[i];
+        // We don't need message for our own data
+        nrecv += wcnt_all[cid] - wcnt_local[cid];
+    }
+    rreqs  = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv);
+    rstats = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nrecv);
+    rbufs  = (char **)NCI_Malloc (sizeof (char *) * nrecv);
+    rsizes = (int *)NCI_Malloc (sizeof (int) * nrecv);
+
+    // Post send and recv
+    nrecv = 0;
+    nsend = 0;
+    for (cid = 0; cid < varp->nchunk; cid++) {
+        if (varp->chunk_owner[cid] == ncchkp->rank) {
+            NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+
+            // We are the owner of the chunk
+            // Receive data from other process
+            for (i = 0; i < wcnt_all[cid] - wcnt_local[cid]; i++) {
+                // Get message size, including metadata
+                // (rstats[0] is only used as scratch space for the probe status here)
+                CHK_ERR_MPROBE (MPI_ANY_SOURCE, cid, ncchkp->comm, &rmsg, rstats);
+                CHK_ERR_GET_COUNT (rstats, MPI_BYTE, rsizes + nrecv);
+
+                // Allocate buffer
+                rbufs[nrecv] = (char *)NCI_Malloc (rsizes[nrecv]);
+
+                // Post irecv
+                CHK_ERR_IMRECV (rbufs[nrecv], rsizes[nrecv], MPI_BYTE, &rmsg, rreqs + nrecv);
+                nrecv++;
+            }
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+        } else {
+            // If any of our requests overlaps with this chunk, we need to send data
+            // We send only 1 message for 1 chunk
+            if (wcnt_local[cid] > 0) {
+                NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ)
+
+                packoff = 0;
+                // Get chunk iterator
+                get_chunk_itr (varp, cid, citr);
+                for (req = 0; req < nreq; req++) {
+                    // Calculate chunk overlap
+                    overlapsize =
+                        get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize);
+
+                    // If current request have any overlap with the chunk, we pack the data and
+                    // metadata
+                    if (overlapsize > 0) {
+                        // Metadata
+                        tstartp = (int *)(sbufs[nsend] + packoff);
+                        packoff += varp->ndim * sizeof (int);
+                        tsizep = (int *)(sbufs[nsend] + packoff);
+                        packoff += varp->ndim * sizeof (int);
+                        for (j = 0; j < varp->ndim; j++) {
+                            tstartp[j] = (int)(ostart[j] - citr[j]);
+                            tsizep[j]  = (int)osize[j];
+                        }
+
+                        // Pack type
+                        for (j = 0; j < varp->ndim; j++) {
+                            tstart[j] = (int)(ostart[j] - starts[req][j]);
+                            tsize[j]  = (int)counts[req][j];
+                            tssize[j] = (int)osize[j];
+                        }
+                        CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart,
+                                                      MPI_ORDER_C, varp->etype, &ptype);
+                        CHK_ERR_TYPE_COMMIT (&ptype);
+
+                        // Data
+                        CHK_ERR_PACK (bufs[req], 1, ptype, sbufs[nsend], packoff + overlapsize,
+                                      &packoff, ncchkp->comm);
+                        MPI_Type_free (&ptype);
+                    }
+                }
+
+                NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ)
+                NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+
+                // Send the request
+                CHK_ERR_ISEND (sbufs[nsend], packoff, MPI_BYTE, varp->chunk_owner[cid], cid,
+                               ncchkp->comm, sreqs + nsend);
+                nsend++;
+
+                NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+            }
+        }
+    }
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+
+    // Wait for all send
+    CHK_ERR_WAITALL (nsend, sreqs, sstats);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT)
+
+    // Preparing chunk cache
+    // The counting pass must use exactly the same selection as the filling pass below.
+    // An owned chunk written by this rank (-1 mark) and by exactly one other rank sums to
+    // wcnt_all[cid] == 0; testing wcnt_all alone would leave it uncounted and overflow rids.
+    nread = 0;
+    for (i = 0; i < varp->nmychunk; i++) {
+        cid = varp->mychunks[i];
+        if ((wcnt_all[cid] || wcnt_local[cid]) && varp->chunk_cache[cid] == NULL) {
+            if (varp->chunk_index[cid].len > 0) { nread++; }
+        }
+    }
+    rids  = (int *)NCI_Malloc (sizeof (int) * nread);
+    nread = 0;
+    for (i = 0; i < varp->nmychunk; i++) {
+        cid = varp->mychunks[i];
+        if (wcnt_all[cid] || wcnt_local[cid]) {
+            if (varp->chunk_cache[cid] == NULL) {
+                err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid);
+                CHK_ERR
+                // varp->chunk_cache[cid] = (NC_chk_cache*)NCI_Malloc(varp->chunksize);
+                if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; }
+            } else {
+                ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]);
+            }
+        }
+    }
+    // Increase batch number to indicate allocated chunk buffer can be freed for future allocation
+    (ncchkp->cache_serial)++;
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT)
+
+    // Read background
+    ncchkioi_load_var_bg (ncchkp, varp, nread, rids);
+
+    // Allocate intermediate buffer
+    if (max_tbuf > 0) { tbuf = (char *)NCI_Malloc (max_tbuf); }
+
+    // For each chunk we own, we need to receive incoming data
+    nrecv = 0;
+    for (i = 0; i < varp->nmychunk; i++) {
+        cid = varp->mychunks[i];
+
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF)
+
+        // Handle our own data first if we have any
+        if (wcnt_local[cid] < 0) {
+            for (req = 0; req < nreq; req++) {
+                // Convert chunk id to iterator
+                get_chunk_itr (varp, cid, citr);
+
+                // Calculate overlapping region
+                overlapsize =
+                    get_chunk_overlap (varp, citr, starts[req], counts[req], ostart, osize);
+
+                // If anything overlaps
+                if (overlapsize > 0) {
+                    // Pack type from user buffer to (contiguous) intermediate buffer
+                    for (j = 0; j < varp->ndim; j++) {
+                        tstart[j] = (int)(ostart[j] - starts[req][j]);
+                        tsize[j]  = (int)counts[req][j];
+                        tssize[j] = (int)osize[j];
+                    }
+
+                    CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C,
+                                                  varp->etype, &ptype);
+                    CHK_ERR_TYPE_COMMIT (&ptype);
+
+                    // Pack data into intermediate buffer
+                    packoff = 0;
+                    CHK_ERR_PACK (bufs[req], 1, ptype, tbuf, overlapsize, &packoff, ncchkp->comm);
+
+                    MPI_Type_free (&ptype);
+
+                    // Pack type from (contiguous) intermediate buffer to chunk buffer
+                    for (j = 0; j < varp->ndim; j++) {
+                        tstart[j] = (int)(ostart[j] - citr[j]);
+                        tsize[j]  = varp->chunkdim[j];
+                    }
+                    CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C,
+                                                  varp->etype, &ptype);
+                    CHK_ERR_TYPE_COMMIT (&ptype);
+
+                    // Unpack data into chunk buffer
+                    packoff = 0;
+                    CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1,
+                                    ptype, ncchkp->comm);
+
+                    MPI_Type_free (&ptype);
+
+                    // Mark chunk as dirty
+                    varp->dirty[cid] = 1;
+                }
+            }
+        }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+
+        // Now, it is time to process data from other processes
+
+        // Wait for all receive requests related to this chunk
+        // We remove the impact of -1 mark in wcnt_local[cid]
+        CHK_ERR_WAITALL (wcnt_all[cid] - wcnt_local[cid], rreqs + nrecv, rstats + nrecv);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ)
+
+        // Process data received
+        for (j = nrecv; j < nrecv + wcnt_all[cid] - wcnt_local[cid]; j++) {
+            packoff = 0;
+            while (packoff < rsizes[j]) {
+                // Metadata
+                tstartp = (int *)(rbufs[j] + packoff);
+                packoff += varp->ndim * sizeof (int);
+                tsizep = (int *)(rbufs[j] + packoff);
+                packoff += varp->ndim * sizeof (int);
+
+                // Packtype
+                CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, varp->chunkdim, tsizep, tstartp,
+                                              MPI_ORDER_C, varp->etype, &ptype);
+                CHK_ERR_TYPE_COMMIT (&ptype);
+
+                // Data
+                CHK_ERR_UNPACK (rbufs[j], rsizes[j], &packoff, varp->chunk_cache[cid]->buf, 1,
+                                ptype, ncchkp->comm);
+                MPI_Type_free (&ptype);
+
+                // Mark chunk as dirty
+                varp->dirty[cid] = 1;
+            }
+        }
+        nrecv += wcnt_all[cid] - wcnt_local[cid];
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ)
+    }
+
+    // Free buffers
+    NCI_Free (wcnt_local);
+
+    NCI_Free (tstart);
+
+    NCI_Free (ostart);
+
+    NCI_Free (sreqs);
+    NCI_Free (sstats);
+    for (i = 0; i < nsend; i++) { NCI_Free (sbufs[i]); }
+    NCI_Free (sbufs);
+
+    NCI_Free (rreqs);
+    NCI_Free (rstats);
+    for (i = 0; i < nrecv; i++) { NCI_Free (rbufs[i]); }
+    NCI_Free (rbufs);
+    NCI_Free (rsizes);
+
+    if (tbuf != NULL) { NCI_Free (tbuf); }
+
+    if (rids != NULL) { NCI_Free (rids); }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB)
+
+err_out:;
+    return err;
+}
+
+/*
+ * Collective-buffering step of put_varn that exchanges ONE MESSAGE PER PROCESS.
+ *
+ * All data this rank writes into chunks owned by the same remote rank is packed into a
+ * single message (tag 0).  Each packed piece is preceded by the chunk id and the
+ * start/size of the piece inside that chunk.  Data for chunks owned by this rank is copied
+ * directly, and every received message is unpacked into the chunk cache.  Every chunk that
+ * receives data is marked dirty.
+ *
+ * ncchkp : file object (rank, np, communicator, cache bookkeeping)
+ * varp   : variable being written
+ * nreq   : number of requests
+ * starts : starts[req][dim], start coordinate of each request
+ * counts : counts[req][dim], extent of each request
+ * bufs   : bufs[req], user data of each request
+ *
+ * Returns NC_NOERR, or the error set by one of the CHK_ERR* macros.
+ * NOTE(review): as in the per-chunk variant, an error jump to err_out skips the free
+ * section; the return code of MPI_Waitany is also not checked.
+ */
+int ncchkioi_put_varn_cb_proc (NC_chk *ncchkp,
+                               NC_chk_var *varp,
+                               int nreq,
+                               MPI_Offset *const *starts,
+                               MPI_Offset *const *counts,
+                               void **bufs) {
+    int err=NC_NOERR;
+    int i, j, k;
+    int cid, cown;  // Chunk iterator and owner
+    int req;
+
+    MPI_Offset *ostart, *osize;
+    int *tsize, *tssize, *tstart, *tssizep, *tstartp;  // Size for sub-array type
+    MPI_Offset *citr;                                  // Chunk iterator
+
+    int *wcnt_local, *wcnt_all;          // Per-rank flag / count of processes writing to a rank
+    int wrange_local[2], wrange_all[2];  // [lowest, highest] chunk id accessed
+
+    int nread;  // Chunks to read for background
+    int *rids;
+
+    int overlapsize;    // Size of overlapping region of request and chunk
+    char *tbuf = NULL;  // Intermediate buffer
+
+    int packoff;         // Pack offset
+    MPI_Datatype ptype;  // Pack datatype
+
+    int nsend, nrecv;                       // Number of send and receive
+    MPI_Request *sreq, *rreq;               // Send and recv req
+    MPI_Status *sstat, rstat;               // Send and recv status
+    char **sbuf, **sbufp, **rbuf, **rbufp;  // Send and recv buffer (base and cursor)
+    int *rsize, *ssize;                     // Size of each received / sent message
+    int *sdst;                              // Destination rank of each send message
+    int *smap;                              // Map from owner rank to send message slot
+    MPI_Message rmsg;                       // Receive message
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT)
+
+    // Allocate buffering for write count
+    wcnt_local = (int *)NCI_Malloc (sizeof (int) * ncchkp->np * 3);
+    wcnt_all   = wcnt_local + ncchkp->np;
+    smap       = wcnt_all + ncchkp->np;
+
+    // Allocate buffering for overlapping index
+    tstart = (int *)NCI_Malloc (sizeof (int) * varp->ndim * 3);
+    tssize = tstart + varp->ndim;
+    tsize  = tssize + varp->ndim;
+    ostart = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * varp->ndim * 3);
+    osize  = ostart + varp->ndim;
+
+    // Chunk iterator
+    citr = osize + varp->ndim;
+
+    // We need to calculate the size of message of each processes
+    // This is just for allocating send buffer
+    // We do so by iterating through all request and all chunks they cover
+    // If we are not the owner of a chunk, we need to send message
+    memset (wcnt_local, 0, sizeof (int) * ncchkp->np);
+    nsend = 0;
+
+    // Count total number of messages and build a map of accessed chunk to list of comm
+    // datastructure
+    wrange_local[0] = varp->nchunk;
+    wrange_local[1] = 0;
+    for (req = 0; req < nreq; req++) {
+        ncchkioi_chunk_itr_init (varp, starts[req], counts[req], citr,
+                                 &cid);  // Initialize chunk iterator
+        do {
+            // Chunk owner
+            cown = varp->chunk_owner[cid];
+
+            // Mapping to skip list of send requests
+            if (wcnt_local[cown] == 0 && cown != ncchkp->rank) { smap[cown] = nsend++; }
+            wcnt_local[cown] = 1;  // Need to send message if not owner
+
+            // Record lowest and highest chunk accessed
+            if (wrange_local[0] > cid) { wrange_local[0] = cid; }
+            if (wrange_local[1] < cid) { wrange_local[1] = cid; }
+        } while (ncchkioi_chunk_itr_next (varp, starts[req], counts[req], citr, &cid));
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SYNC)
+
+    // Sync number of messages of each process
+    CHK_ERR_ALLREDUCE (wcnt_local, wcnt_all, ncchkp->np, MPI_INT, MPI_SUM, ncchkp->comm);
+    // Negate the upper bound so that one MPI_MIN reduction yields both the global minimum
+    // and (after negating back) the global maximum
+    wrange_local[1] *= -1;
+    CHK_ERR_ALLREDUCE (wrange_local, wrange_all, 2, MPI_INT, MPI_MIN, ncchkp->comm);
+    nrecv = wcnt_all[ncchkp->rank] -
+            wcnt_local[ncchkp->rank];  // We don't need to receive request from self
+    wrange_all[1] *= -1;
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SYNC)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_PACK_REQ)
+
+    // Allocate data structure for messaging
+    sbuf  = (char **)NCI_Malloc (sizeof (char *) * nsend * 2);
+    sbufp = sbuf + nsend;
+    ssize = (int *)NCI_Malloc (sizeof (int) * nsend * 2);
+    sdst  = ssize + nsend;
+    sreq  = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nsend);
+    sstat = (MPI_Status *)NCI_Malloc (sizeof (MPI_Status) * nsend);
+
+    rbuf  = (char **)NCI_Malloc (sizeof (char *) * nrecv * 2);
+    rbufp = rbuf + nrecv;
+    rsize = (int *)NCI_Malloc (sizeof (int) * nrecv);
+    rreq  = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nrecv);
+
+    // Count size of each request
+    memset (ssize, 0, sizeof (int) * nsend);
+    for (req = 0; req < nreq; req++) {
+        ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                    osize);  // Initialize chunk iterator
+        do {
+            // Chunk owner
+            cown = varp->chunk_owner[cid];
+            if (cown != ncchkp->rank) {
+                j       = smap[cown];
+                sdst[j] = cown;  // Record a reverse map by the way
+
+                // Count overlap
+                overlapsize = varp->esize;
+                for (i = 0; i < varp->ndim; i++) { overlapsize *= osize[i]; }
+                ssize[j] += overlapsize + sizeof (int) * (varp->ndim * 2 + 1);
+            }
+        } while (
+            ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize));
+    }
+    // Allocate buffer for send
+    for (i = 0; i < nsend; i++) { sbuf[i] = sbufp[i] = (char *)NCI_Malloc (ssize[i]); }
+
+    // Pack requests
+    for (req = 0; req < nreq; req++) {
+        ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                    osize);  // Initialize chunk iterator
+        do {
+            // Chunk owner
+            cown = varp->chunk_owner[cid];
+            if (cown != ncchkp->rank) {
+                j = smap[cown];
+
+                // Metadata
+                *((int *)sbufp[j]) = cid;
+                sbufp[j] += sizeof (int);
+                tstartp = (int *)sbufp[j];
+                sbufp[j] += varp->ndim * sizeof (int);
+                tssizep = (int *)sbufp[j];
+                sbufp[j] += varp->ndim * sizeof (int);
+                for (i = 0; i < varp->ndim; i++) {
+                    tstartp[i] = (int)(ostart[i] - citr[i]);
+                    tssizep[i] = (int)osize[i];
+                }
+
+                // Pack type
+                for (i = 0; i < varp->ndim; i++) {
+                    tstart[i] = (int)(ostart[i] - starts[req][i]);
+                    tsize[i]  = (int)counts[req][i];
+                }
+                CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstart, MPI_ORDER_C,
+                                              varp->etype, &ptype);
+                CHK_ERR_TYPE_COMMIT (&ptype);
+
+                // Data
+                packoff = 0;
+                CHK_ERR_PACK (bufs[req], 1, ptype, sbufp[j], ssize[j], &packoff, ncchkp->comm);
+                sbufp[j] += packoff;
+                MPI_Type_free (&ptype);
+            }
+        } while (
+            ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize));
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_PACK_REQ)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+
+    // Post send
+    for (i = 0; i < nsend; i++) {
+        CHK_ERR_ISEND (sbuf[i], ssize[i], MPI_BYTE, sdst[i], 0, ncchkp->comm, sreq + i);
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+
+    // Post recv
+    for (i = 0; i < nrecv; i++) {
+        // Get message size, including metadata
+        CHK_ERR_MPROBE (MPI_ANY_SOURCE, 0, ncchkp->comm, &rmsg, &rstat);
+        CHK_ERR_GET_COUNT (&rstat, MPI_BYTE, rsize + i);
+
+        // Allocate buffer
+        rbuf[i] = rbufp[i] = (char *)NCI_Malloc (rsize[i]);
+
+        // Post irecv
+        CHK_ERR_IMRECV (rbuf[i], rsize[i], MPI_BYTE, &rmsg, rreq + i);
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_INIT)
+
+    // Preparing chunk cache
+    // [j, k) is the slice of mychunks whose ids lie inside the globally accessed chunk range
+    // (this scan relies on mychunks being in ascending order)
+    for (j = 0; j < varp->nmychunk && varp->mychunks[j] < wrange_all[0]; j++)
+        ;
+    for (k = j; k < varp->nmychunk && varp->mychunks[k] <= wrange_all[1]; k++)
+        ;
+    rids  = (int *)NCI_Malloc (sizeof (int) * (k - j));
+    nread = 0;
+    for (i = j; i < k; i++) {
+        cid = varp->mychunks[i];
+        if (varp->chunk_cache[cid] == NULL) {
+            err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid);
+            CHK_ERR
+            // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize);
+            if (varp->chunk_index[cid].len > 0) { rids[nread++] = cid; }
+        } else {
+            ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]);
+        }
+    }
+    // Increase batch number to indicate allocated chunk buffer can be freed for future allocation
+    (ncchkp->cache_serial)++;
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_INIT)
+
+    // Read background
+    ncchkioi_load_var_bg (ncchkp, varp, nread, rids);
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SELF)
+
+    tbuf = (char *)NCI_Malloc (varp->chunksize);
+
+    // Handle our own data
+    for (req = 0; req < nreq; req++) {
+        ncchkioi_chunk_itr_init_ex (varp, starts[req], counts[req], citr, &cid, ostart,
+                                    osize);  // Initialize chunk iterator
+        do {
+            if (varp->chunk_owner[cid] == ncchkp->rank) {
+                // Pack type from user buffer to (contiguous) intermediate buffer
+                for (j = 0; j < varp->ndim; j++) {
+                    tstart[j] = (int)(ostart[j] - starts[req][j]);
+                    tsize[j]  = (int)counts[req][j];
+                    tssize[j] = (int)osize[j];
+                }
+                CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C,
+                                              varp->etype, &ptype);
+                CHK_ERR_TYPE_COMMIT (&ptype);
+
+                // Pack data into intermediate buffer
+                packoff = 0;
+                CHK_ERR_PACK (bufs[req], 1, ptype, tbuf, varp->chunksize, &packoff, ncchkp->comm);
+                MPI_Type_free (&ptype);
+                overlapsize = packoff;
+
+                // Pack type from (contiguous) intermediate buffer to chunk buffer
+                for (j = 0; j < varp->ndim; j++) {
+                    tstart[j] = (int)(ostart[j] - citr[j]);
+                    tsize[j]  = varp->chunkdim[j];
+                }
+                CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssize, tstart, MPI_ORDER_C,
+                                              varp->etype, &ptype);
+                CHK_ERR_TYPE_COMMIT (&ptype);
+
+                // Unpack data into chunk buffer
+                packoff = 0;
+                CHK_ERR_UNPACK (tbuf, overlapsize, &packoff, varp->chunk_cache[cid]->buf, 1, ptype,
+                                ncchkp->comm);
+                MPI_Type_free (&ptype);
+
+                // Mark chunk as dirty
+                varp->dirty[cid] = 1;
+            }
+        } while (
+            ncchkioi_chunk_itr_next_ex (varp, starts[req], counts[req], citr, &cid, ostart, osize));
+    }
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SELF)
+
+    // Handle incoming requests
+    for (i = 0; i < varp->ndim; i++) { tsize[i] = varp->chunkdim[i]; }
+    for (i = 0; i < nrecv; i++) {
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+
+        // Will wait any provide any benefit?
+        MPI_Waitany (nrecv, rreq, &j, &rstat);
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_RECV_REQ)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_UNPACK_REQ)
+
+        while (rbufp[j] < rbuf[j] + rsize[j]) {
+            // Metadata
+            cid = *(int *)(rbufp[j]);
+            rbufp[j] += sizeof (int);
+            tstartp = (int *)rbufp[j];
+            rbufp[j] += varp->ndim * sizeof (int);
+            tssizep = (int *)rbufp[j];
+            rbufp[j] += varp->ndim * sizeof (int);
+
+            // Pack type
+            CHK_ERR_TYPE_CREATE_SUBARRAY (varp->ndim, tsize, tssizep, tstartp, MPI_ORDER_C,
+                                          varp->etype, &ptype);
+            CHK_ERR_TYPE_COMMIT (&ptype);
+
+            // Data
+            packoff = 0;
+            CHK_ERR_UNPACK (rbufp[j], rsize[j], &packoff, varp->chunk_cache[cid]->buf, 1, ptype,
+                            ncchkp->comm);
+            rbufp[j] += packoff;
+            MPI_Type_free (&ptype);
+
+            // Mark chunk as dirty
+            varp->dirty[cid] = 1;
+        }
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_UNPACK_REQ)
+    }
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+
+    CHK_ERR_WAITALL (nsend, sreq, sstat);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB_SEND_REQ)
+
+    // Free buffers
+    NCI_Free (wcnt_local);
+
+    NCI_Free (tstart);
+
+    NCI_Free (ostart);
+
+    NCI_Free (sreq);
+    NCI_Free (sstat);
+    NCI_Free (ssize);
+    for (i = 0; i < nsend; i++) { NCI_Free (sbuf[i]); }
+    NCI_Free (sbuf);
+
+    NCI_Free (rreq);
+    for (i = 0; i < nrecv; i++) { NCI_Free (rbuf[i]); }
+    NCI_Free (rbuf);
+    NCI_Free (rsize);
+
+    NCI_Free (tbuf);
+
+    NCI_Free (rids);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_CB)
+
+err_out:;
+    return err;
+}
+
+/*
+ * Write nreq sub-arrays of a chunked variable.
+ *
+ * buf holds the data of all requests back to back; request i occupies
+ * esize * prod(counts[i][0..ndim-1]) bytes.  For record variables the global record count
+ * is synchronised first and the variable is resized when it grew.  The data is then
+ * redistributed to the chunk owners with the exchange routine selected by
+ * ncchkp->comm_unit, and the variable is written with ncchkioi_save_var().
+ *
+ * Returns NC_NOERR, the error raised while synchronising the record count, or the error
+ * reported by the exchange routine.
+ */
+int ncchkioi_put_varn (NC_chk *ncchkp,
+                       NC_chk_var *varp,
+                       int nreq,
+                       MPI_Offset *const *starts,
+                       MPI_Offset *const *counts,
+                       const void *buf) {
+    int err=NC_NOERR;
+    int i, j;
+    MPI_Offset rsize;
+    char *bptr = (char *)buf;
+    // Must start as NULL: CHK_ERR_ALLREDUCE below can reach err_out before the allocation
+    char **bufs = NULL;
+
+    if (varp->isrec) {
+        for (i = 0; i < nreq; i++) {
+            if (ncchkp->recsize < starts[i][0] + counts[i][0]) {
+                ncchkp->recsize = starts[i][0] + counts[i][0];
+            }
+        }
+        CHK_ERR_ALLREDUCE (MPI_IN_PLACE, &(ncchkp->recsize), 1, MPI_LONG_LONG, MPI_MAX,
+                           ncchkp->comm);  // Sync number of recs
+        if (varp->dimsize[0] < ncchkp->recsize) {
+            NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_PUT)
+            NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE)
+
+            ncchkioi_var_resize (ncchkp, varp);
+
+            NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE)
+            NC_CHK_TIMER_START (NC_CHK_TIMER_PUT)
+        }
+    }
+
+    // Calculate buffer offset of each request
+    bufs = (char **)NCI_Malloc (sizeof (char *) * nreq);
+    for (i = 0; i < nreq; i++) {
+        bufs[i] = bptr;
+        rsize   = varp->esize;
+        for (j = 0; j < varp->ndim; j++) { rsize *= counts[i][j]; }
+        bptr += rsize;
+    }
+
+    // Collective buffer
+    // Keep the status of the exchange so that a failure is reported to the caller
+    switch (ncchkp->comm_unit) {
+        case NC_CHK_COMM_CHUNK:
+            err = ncchkioi_put_varn_cb_chunk (ncchkp, varp, nreq, starts, counts, NULL,
+                                              (void **)bufs);
+            break;
+        case NC_CHK_COMM_PROC:
+            err = ncchkioi_put_varn_cb_proc (ncchkp, varp, nreq, starts, counts, (void **)bufs);
+            break;
+    }
+
+    // Write the compressed variable
+    // Still called after a failed exchange so that every rank makes the same sequence of
+    // calls (presumably the write is collective -- TODO confirm).
+    // NOTE(review): the status of ncchkioi_save_var() is not inspected here.
+    ncchkioi_save_var (ncchkp, varp);
+
+err_out:;
+    if (bufs != NULL) { NCI_Free (bufs); }
+
+    return err;
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_util.c b/src/drivers/ncchunkio/ncchkioi_util.c
new file mode 100644
index 0000000000..30d0ebff1a
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_util.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2017, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/* NOTE(review): the header names on the six #include lines below were lost when this
+ * patch was extracted (only bare "#include" survived).  They are reconstructed from what
+ * the code uses -- confirm against the upstream file before applying. */
+#include <common.h>
+#include <mpi.h>
+#include <pnc_debug.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ncchkio_internal.h"
+
+/* return internal size for values of specified netCDF type (0 for an unknown type) */
+MPI_Offset NC_Type_size (nc_type type) { /* netCDF type code */
+    switch (type) {
+        case NC_BYTE:
+            return sizeof (char);
+        case NC_CHAR:
+            return sizeof (char);
+        case NC_SHORT:
+            return sizeof (short);
+        case NC_INT:
+            return sizeof (int);
+        case NC_FLOAT:
+            return sizeof (float);
+        case NC_DOUBLE:
+            return sizeof (double);
+        case NC_UBYTE:
+            return sizeof (unsigned char);
+        case NC_USHORT:
+            return sizeof (unsigned short);
+        case NC_UINT:
+            return sizeof (unsigned int);
+        case NC_INT64:
+            return sizeof (long long);
+        case NC_UINT64:
+            return sizeof (unsigned long long);
+        default:
+
+            return 0;
+    }
+}
+
+/*
+ * Convert NC type to MPI type
+ *
+ * Covers the same eleven types as NC_Type_size().  An unknown type yields
+ * MPI_DATATYPE_NULL (the previous code returned NC_NAT, which is an nc_type, not an MPI
+ * datatype handle).
+ */
+MPI_Datatype ncchkioi_nc_to_mpi_type (nc_type atype) {
+    switch (atype) {
+        case NC_BYTE:
+            return MPI_BYTE;
+        case NC_CHAR:
+            return MPI_CHAR;
+        case NC_SHORT:
+            return MPI_SHORT;
+        case NC_INT:
+            return MPI_INT;
+        case NC_FLOAT:
+            return MPI_FLOAT;
+        case NC_DOUBLE:
+            return MPI_DOUBLE;
+        case NC_UBYTE:
+            return MPI_UNSIGNED_CHAR;
+        case NC_USHORT:
+            return MPI_UNSIGNED_SHORT;
+        case NC_UINT:
+            return MPI_UNSIGNED;
+        case NC_INT64:
+            return MPI_LONG_LONG_INT;
+        case NC_UINT64:
+            return MPI_UNSIGNED_LONG_LONG;
+        default:
+            break;
+    }
+
+    return MPI_DATATYPE_NULL;
+}
+
+/*
+ * Extract mpi hints and set up the flags
+ *
+ * Every setting is first given its default and then overridden when the matching key is
+ * present in info.  Unknown values fall back to the default; the warning is printed by
+ * rank 0 only.  Always returns NC_NOERR.
+ */
+int ncchkioi_extract_hint (NC_chk *ncchkp, MPI_Info info) {
+    int flag;
+    char value[MPI_MAX_INFO_VAL];
+
+    // Block assignment ("static" is the only mapping available)
+    ncchkp->blockmapping = NC_CHK_MAPPING_STATIC;
+    MPI_Info_get (info, "nc_chk_block_mapping", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag && strcmp (value, "static") != 0) {
+        if (ncchkp->rank == 0) { printf ("Warning: Unknown mapping %s, using static\n", value); }
+    }
+
+    // Messaging unit
+    ncchkp->comm_unit = NC_CHK_COMM_PROC;
+    MPI_Info_get (info, "nc_chk_comm_unit", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        if (strcmp (value, "chunk") == 0) {
+            ncchkp->comm_unit = NC_CHK_COMM_CHUNK;
+        } else if (strcmp (value, "proc") != 0) {
+            if (ncchkp->rank == 0) {
+                printf ("Warning: Unknown messaging unit %s, using proc\n", value);
+            }
+        }
+    }
+
+    // Delay init
+    ncchkp->delay_init = 0;
+    MPI_Info_get (info, "nc_chk_delay_init", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        if (strcmp (value, "1") == 0) { ncchkp->delay_init = 1; }
+    }
+
+    // Exact chunk owner assignment
+    ncchkp->exact_cown = 0;
+    MPI_Info_get (info, "nc_chk_exact_cown", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        if (strcmp (value, "1") == 0) { ncchkp->exact_cown = 1; }
+    }
+
+    // Additional reserved space in file header
+    ncchkp->hdr_reserve = 1048576;  // 1 MiB default
+    MPI_Info_get (info, "nc_chk_hdr_reserve", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) { ncchkp->hdr_reserve = atoi (value); }
+
+    // Reserve space for records (this block used to appear twice in a row)
+    ncchkp->default_recnalloc = NC_CHK_DEFAULT_REC_ALLOC;
+    MPI_Info_get (info, "nc_chk_nrec", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) { ncchkp->default_recnalloc = atoi (value); }
+
+    // Default filter
+    ncchkp->default_filter = NC_CHK_FILTER_NONE;
+    MPI_Info_get (info, "nc_chunk_default_filter", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        if (strcmp (value, "none") == 0) {
+            ncchkp->default_filter = NC_CHK_FILTER_NONE;
+        } else if (strcmp (value, "dummy") == 0) {
+            ncchkp->default_filter = NC_CHK_FILTER_DUMMY;
+        } else if (strcmp (value, "zlib") == 0) {
+            ncchkp->default_filter = NC_CHK_FILTER_ZLIB;
+        } else if (strcmp (value, "sz") == 0) {
+            ncchkp->default_filter = NC_CHK_FILTER_SZ;
+        } else {
+            if (ncchkp->rank == 0) { printf ("Warning: Unknown filter %s, use none\n", value); }
+        }
+    }
+
+    // Buffer size
+    ncchkp->cache_limit      = 0;  // Unlimited
+    ncchkp->cache_limit_hint = 0;
+    MPI_Info_get (info, "nc_chk_buffer_size", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) {
+        sscanf (value, "%zd", &(ncchkp->cache_limit_hint));
+
+        if (ncchkp->cache_limit_hint > 0) { ncchkp->cache_limit = ncchkp->cache_limit_hint; }
+    }
+
+    // Chunk owning size penalty
+    ncchkp->cown_ratio = 0.1;
+    MPI_Info_get (info, "nc_chk_cown_ratio", MPI_MAX_INFO_VAL - 1, value, &flag);
+    if (flag) { ncchkp->cown_ratio = atof (value); }
+
+    return NC_NOERR;
+}
+
+/*
+ * Export the current settings of ncchkp into info.
+ *
+ * Numeric settings are converted through long long so that the printf conversion matches
+ * the argument whatever integer type the field is declared with.  Always returns
+ * NC_NOERR.
+ */
+int ncchkioi_export_hint (NC_chk *ncchkp, MPI_Info info) {
+    char value[MPI_MAX_INFO_VAL];
+    const char *filter = NULL;
+
+    MPI_Info_set (info, "nc_compression", "enable");
+
+    switch (ncchkp->blockmapping) {
+        case NC_CHK_MAPPING_STATIC:
+            MPI_Info_set (info, "nc_chk_block_mapping", "static");
+            break;
+    }
+
+    switch (ncchkp->comm_unit) {
+        case NC_CHK_COMM_CHUNK:
+            MPI_Info_set (info, "nc_chk_comm_unit", "chunk");
+            break;
+        case NC_CHK_COMM_PROC:
+            MPI_Info_set (info, "nc_chk_comm_unit", "proc");
+            break;
+    }
+
+    // Delay init
+    if (ncchkp->delay_init) {
+        MPI_Info_set (info, "nc_chk_delay_init", "1");
+    } else {
+        MPI_Info_set (info, "nc_chk_delay_init", "0");
+    }
+
+    // Exact cown
+    if (ncchkp->exact_cown) {
+        MPI_Info_set (info, "nc_chk_exact_cown", "1");
+    } else {
+        MPI_Info_set (info, "nc_chk_exact_cown", "0");
+    }
+
+    // Additional reserved space in file header
+    snprintf (value, sizeof (value), "%lld", (long long)ncchkp->hdr_reserve);
+    MPI_Info_set (info, "nc_chk_hdr_reserve", value);
+
+    // Reserve space for records
+    snprintf (value, sizeof (value), "%lld", (long long)ncchkp->default_recnalloc);
+    MPI_Info_set (info, "nc_chk_nrec", value);
+
+    // Default filter
+    switch (ncchkp->default_filter) {
+        case NC_CHK_FILTER_NONE:
+            filter = "none";
+            break;
+        case NC_CHK_FILTER_DUMMY:
+            filter = "dummy";
+            break;
+        case NC_CHK_FILTER_ZLIB:
+            filter = "zlib";
+            break;
+        case NC_CHK_FILTER_SZ:
+            filter = "sz";
+            break;
+    }
+    if (filter != NULL) {
+        // Key that has always been exported
+        MPI_Info_set (info, "nc_chk_driver", filter);
+        // Key that ncchkioi_extract_hint() reads, so that exported hints can be fed back in
+        MPI_Info_set (info, "nc_chunk_default_filter", filter);
+    }
+
+    // Buffer size
+    snprintf (value, sizeof (value), "%lld", (long long)ncchkp->cache_limit);
+    MPI_Info_set (info, "nc_chk_buffer_size", value);
+
+    return NC_NOERR;
+}
+
+/*
+ * Debug helper: print "Rank <r>:  <prefix> v0 v1 ... " on one line of stdout.
+ * The rank is the one in MPI_COMM_WORLD.  Always returns NC_NOERR.
+ */
+int ncchkioi_print_buffer_int (char *prefix, int *buf, int len) {
+    int i;
+    int rank;
+    size_t cap;
+    char *out, *outp;
+
+    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
+
+    // Worst case: "Rank " + 11 digits + ":  " (19), prefix + ' ', 12 bytes per value
+    // ("-2147483648 "), terminating NUL
+    cap = 19 + strlen (prefix) + 1 + 1;
+    if (len > 0) { cap += (size_t)len * 12; }
+    out = outp = (char *)NCI_Malloc (cap);
+
+    outp += sprintf (outp, "Rank %d:  ", rank);
+    outp += sprintf (outp, "%s ", prefix);
+    for (i = 0; i < len; i++) { outp += sprintf (outp, "%d ", buf[i]); }
+
+    printf ("%s\n", out);
+    fflush (stdout);
+
+    NCI_Free (out);
+
+    return NC_NOERR;
+}
+
+/*
+ * Debug helper: 64-bit variant of ncchkioi_print_buffer_int().
+ */
+int ncchkioi_print_buffer_int64 (char *prefix, long long *buf, int len) {
+    int i;
+    int rank;
+    size_t cap;
+    char *out, *outp;
+
+    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
+
+    // Worst case per value is 21 bytes ("-9223372036854775808 ")
+    cap = 19 + strlen (prefix) + 1 + 1;
+    if (len > 0) { cap += (size_t)len * 21; }
+    out = outp = (char *)NCI_Malloc (cap);
+
+    outp += sprintf (outp, "Rank %d:  ", rank);
+    outp += sprintf (outp, "%s ", prefix);
+    for (i = 0; i < len; i++) { outp += sprintf (outp, "%lld ", buf[i]); }
+
+    printf ("%s\n", out);
+    fflush (stdout);
+
+    NCI_Free (out);
+
+    return NC_NOERR;
+}
+
+/*
+ * Swap entries V0 and V1 of the three parallel arrays fdisps, mdisps and lens that are in
+ * scope at the point of use.  Temporaries are used instead of the XOR trick because an
+ * XOR swap of an element with itself sets it to zero.
+ */
+#define NCCHKIOISWAP(V0, V1)             \
+    do {                                 \
+        MPI_Aint ncchkioi_swp_disp_;     \
+        int ncchkioi_swp_len_;           \
+        ncchkioi_swp_disp_ = fdisps[V0]; \
+        fdisps[V0]         = fdisps[V1]; \
+        fdisps[V1]         = ncchkioi_swp_disp_; \
+        ncchkioi_swp_disp_ = mdisps[V0]; \
+        mdisps[V0]         = mdisps[V1]; \
+        mdisps[V1]         = ncchkioi_swp_disp_; \
+        ncchkioi_swp_len_  = lens[V0];   \
+        lens[V0]           = lens[V1];   \
+        lens[V1]           = ncchkioi_swp_len_; \
+    } while (0)
+
+/*
+ * Sort the triples (fdisps[i], mdisps[i], lens[i]) in place by ascending fdisps[i].
+ * Fewer than 16 entries are bubble sorted; otherwise the range is partitioned around the
+ * middle element and both halves are sorted recursively.
+ */
+void ncchkioi_sort_file_offset (int len, MPI_Aint *fdisps, MPI_Aint *mdisps, int *lens) {
+    int i, j, p;
+
+    if (len < 16) {
+        j = 1;
+        while (j) {
+            j = 0;
+            for (i = 0; i < len - 1; i++) {
+                if (fdisps[i] > fdisps[i + 1]) {
+                    NCCHKIOISWAP (i, i + 1);
+                    j = 1;
+                }
+            }
+        }
+    } else {
+        // Park the pivot (middle element) in the last slot
+        j = len / 2;
+        p = len - 1;
+        NCCHKIOISWAP (j, p);
+
+        // Move everything smaller than the pivot to the front
+        for (i = j = 0; i < len; i++) {
+            if (fdisps[i] < fdisps[p]) {
+                if (i != j) { NCCHKIOISWAP (i, j); }
+                j++;
+            }
+        }
+
+        // Put the pivot at its final position; j == p when the pivot is the largest entry
+        NCCHKIOISWAP (p, j);
+
+        ncchkioi_sort_file_offset (j, fdisps, mdisps, lens);
+        ncchkioi_sort_file_offset (len - j - 1, fdisps + j + 1, mdisps + j + 1, lens + j + 1);
+    }
+}
+
+/*
+ * Try to describe a sub-array as one contiguous run of elements.
+ *
+ * tsize/tssize/tstart are the array size, sub-array size and sub-array start in each of
+ * the ndim dimensions (row-major order).  The sub-array is treated as contiguous when
+ *   - every dimension but the last has extent 1 (a single row), or
+ *   - every dimension but the first spans the whole array.
+ * On success (return 0) *off is the linear element index of tstart and *len the number of
+ * elements.  Otherwise -1 is returned and *off / *len are left untouched.
+ */
+int ncchkioi_subarray_off_len (
+    int ndim, int *tsize, int *tssize, int *tstart, MPI_Offset *off, int *len) {
+    int err=NC_NOERR;
+    int i;
+
+    // Try single row
+    err = 0;
+    for (i = 0; i < ndim - 1; i++) {
+        if (tssize[i] != 1) {
+            err = -1;
+            break;
+        }
+    }
+    if (err) {
+        // Try contiguous block
+        err = 0;
+        for (i = 1; i < ndim; i++) {
+            if (tssize[i] < tsize[i]) {
+                err = -1;
+                break;
+            }
+        }
+        if (!err) {
+            *len = 1;
+            for (i = 0; i < ndim; i++) { (*len) *= tssize[i]; }
+        }
+    } else {
+        *len = tssize[ndim - 1];
+    }
+
+    if (!err) {
+        *off = 0;
+        for (i = 0; i < ndim; i++) { (*off) = (*off) * tsize[i] + tstart[i]; }
+    }
+
+    return err;
+}
+
+#ifdef PNETCDF_PROFILING
+/*
+ * Recompute var_size_sum (nmychunk * chunksize) and var_zsize_sum (sum of the stored
+ * chunk lengths) over the chunks this rank owns of every compressed variable.
+ */
+int ncchkioi_update_statistics (NC_chk *ncchkp) {
+    int i, j;
+    int cid;
+    NC_chk_var *varp;
+
+    ncchkp->var_size_sum = ncchkp->var_zsize_sum = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        varp = ncchkp->vars.data + i;
+        if (varp->varkind == NC_CHK_VAR_COMPRESSED) {
+            for (j = 0; j < varp->nmychunk; j++) {
+                cid = varp->mychunks[j];
+                ncchkp->var_zsize_sum += varp->chunk_index[cid].len;
+            }
+            ncchkp->var_size_sum += varp->nmychunk * varp->chunksize;
+        }
+    }
+
+    return NC_NOERR;
+}
+#endif
+
+/*
+ * Read default chunk sizes from the environment variable PNETCDF_DEFAULT_CHUNK_DIM.
+ *
+ * The value is a list of "<dimension name> : <length> ;" entries.  An entry is only
+ * examined once its terminating ';' has been seen, the name must be followed by white
+ * space, and entries with an unknown dimension name or a non-positive length are ignored.
+ * ncchkp->chunkdim is grown (new slots set to 0) to hold one entry per dimension.
+ *
+ * Returns the error of the dimension-count inquiry, otherwise NC_NOERR.
+ */
+int ncchkioi_get_default_chunk_dim (NC_chk *ncchkp) {
+    int err = NC_NOERR, ret;
+    int i;
+    int ndim, dimid;
+    int len;
+    char *cur, *pre;
+    char name[1024];
+    char *env = getenv ("PNETCDF_DEFAULT_CHUNK_DIM");
+
+    if (env != NULL) {
+        err = ncchkp->driver->inq (ncchkp->ncp, &ndim, NULL, NULL, NULL);
+        if (err != NC_NOERR) return err;
+
+        if (ndim > ncchkp->ndim) {
+            ncchkp->chunkdim = (int *)NCI_Realloc (ncchkp->chunkdim, ndim * sizeof (int));
+            for (i = ncchkp->ndim; i < ndim; i++) { ncchkp->chunkdim[i] = 0; }
+            ncchkp->ndim = ndim;
+        }
+
+        for (cur = pre = env; (*cur) != '\0'; cur++) {
+            if ((*cur) == ';') {
+                // The field width keeps an over-long token from overflowing name[1024]
+                if (sscanf (pre, "%1023s : %d ;", name, &len) == 2) {
+                    if (len > 0) {
+                        ret = ncchkp->driver->inq_dimid (ncchkp->ncp, name, &dimid);
+                        // Only store ids that fall inside the chunkdim table
+                        if (ret == NC_NOERR && ncchkp->chunkdim != NULL && dimid >= 0 &&
+                            dimid < ncchkp->ndim) {
+                            ncchkp->chunkdim[dimid] = len;
+                        }
+                    }
+                }
+                pre = cur + 1;
+            }
+        }
+    }
+
+    return NC_NOERR;
+}
+
+/*
+ * in-place byte swap of nelems chunk index entries (8-byte off, 4-byte len)
+ *
+ * The shifts are carried out on unsigned temporaries so that the result is the same
+ * whether the fields are declared signed or unsigned (a right shift of a negative int
+ * would otherwise smear the sign bit into the swapped value).
+ * NOTE(review): assumes len is a 32-bit field, as the original shift pattern implies.
+ */
+void ncchkioi_idx_in_swapn (NC_chk_chunk_index_entry *idx, MPI_Offset nelems) {
+    NC_chk_chunk_index_entry *bufp;
+    unsigned long long off;
+    unsigned int len;
+
+    for (bufp = idx; bufp < idx + nelems; bufp++) {
+        off       = (unsigned long long)bufp->off;
+        bufp->off = ((off & 0x00000000000000FFULL) << 56) |
+                    ((off & 0x000000000000FF00ULL) << 40) |
+                    ((off & 0x0000000000FF0000ULL) << 24) |
+                    ((off & 0x00000000FF000000ULL) << 8) |
+                    ((off & 0x000000FF00000000ULL) >> 8) |
+                    ((off & 0x0000FF0000000000ULL) >> 24) |
+                    ((off & 0x00FF000000000000ULL) >> 40) |
+                    ((off & 0xFF00000000000000ULL) >> 56);
+        len       = (unsigned int)bufp->len;
+        bufp->len = (len << 24) | ((len & 0x0000ff00U) << 8) | ((len & 0x00ff0000U) >> 8) |
+                    (len >> 24);
+    }
+}
diff --git a/src/drivers/ncchunkio/ncchkioi_var_init.c b/src/drivers/ncchunkio/ncchkioi_var_init.c
new file mode 100644
index 0000000000..fc7e239bf7
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_var_init.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkioi_var_init_core ( + NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) { + int err=NC_NOERR; + int ret; + int i, j; + int valid; + MPI_Offset len; + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + if (varp->chunkdim == NULL) { // This is a new uninitialized variable + // Init value + varp->mychunks = NULL; // To be added later + + // Update dimsize on rec dim + if (ncchkp->recdim >= 0) { + if (varp->dimsize[0] < ncchkp->recsize) { varp->dimsize[0] = ncchkp->recsize; } + } + + // Determine its block size + varp->chunkdim = (int *)NCI_Malloc (sizeof (int) * varp->ndim); + varp->nchunks = (int *)NCI_Malloc (sizeof (int) * varp->ndim); + + // First check attribute + valid = 1; + ret = ncchkp->driver->inq_att (ncchkp->ncp, varp->varid, "_chunkdim", NULL, &len); + if (ret == NC_NOERR && len == varp->ndim) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_chunkdim", + varp->chunkdim, MPI_INT); + if (ret != NC_NOERR) { valid = 0; } + // chunkdim must be at leasst 1 + for (j = 0; j < varp->ndim; j++) { + if (varp->chunkdim[j] <= 0) { + valid = 0; + printf ("Warning: chunk size invalid, use default"); + break; + } + } + } else { + valid = 0; + } + + // Now, try global default + if ((!valid) && ncchkp->chunkdim) { + valid = 1; + for (i = 0; i < varp->ndim; i++) { + if (ncchkp->chunkdim[varp->dimids[i]] > 0) { + varp->chunkdim[i] = ncchkp->chunkdim[varp->dimids[i]]; + } else { + valid = 0; + break; + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + + // Still no clue, try to infer form I/O pattern (expensive) + // 
If there is no I/O records, the default is just set to entire variable (only 1 chunk) + if (!valid) { + // Infering not supported + err = ncchkioi_calc_chunk_size (ncchkp, varp, nreq, starts, counts); + CHK_ERR + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + // Calculate total # chunks, # chunks along each dim, chunksize + varp->nchunkrec = 1; + varp->chunksize = NC_Type_size (varp->xtype); + for (i = 0; i < varp->ndim; i++) { // chunkdim must be at leasst 1 + if (varp->dimsize[i] % varp->chunkdim[i] == 0) { + varp->nchunks[i] = (int)(varp->dimsize[i] / (MPI_Offset)varp->chunkdim[i]); + } else { + varp->nchunks[i] = (int)(varp->dimsize[i] / (MPI_Offset)varp->chunkdim[i] + 1); + } + if (i > 0) { varp->nchunkrec *= varp->nchunks[i]; } + varp->chunksize *= varp->chunkdim[i]; + } + if (varp->isrec) { + varp->nrec = varp->nchunks[0]; + varp->nrecalloc = ncchkp->default_recnalloc; + while (varp->nrecalloc < varp->nchunks[0]) { + varp->nrecalloc *= NC_CHK_REC_MULTIPLIER; + } + } else { + varp->nrec = 1; + varp->nrecalloc = 1; + varp->nchunkrec *= varp->nchunks[0]; + } + varp->nchunk = varp->nchunkrec * varp->nrec; + varp->nchunkalloc = varp->nrecalloc * varp->nchunkrec; + + // Calculate number of chunks below each dimension + varp->cidsteps = (int *)NCI_Malloc (sizeof (int) * varp->ndim); + varp->cidsteps[varp->ndim - 1] = 1; + for (i = varp->ndim - 2; i >= 0; i--) { + varp->cidsteps[i] = varp->cidsteps[i + 1] * varp->nchunks[i + 1]; + } + + // Determine block ownership + varp->dirty = (int *)NCI_Malloc (sizeof (int) * varp->nchunkalloc); + varp->chunk_cache = (NC_chk_cache **)NCI_Malloc (sizeof (char *) * varp->nchunkalloc); + memset (varp->chunk_cache, 0, sizeof (char *) * varp->nchunkalloc); + memset (varp->dirty, 0, sizeof (int) * varp->nchunkalloc); + + // Block ownership to be decisded later + varp->chunk_owner = (int *)NCI_Malloc (sizeof (int) * varp->nchunkalloc); + + // Determine block offset + varp->chunk_index = (NC_chk_chunk_index_entry 
*)NCI_Malloc ( + sizeof (NC_chk_chunk_index_entry) * (varp->nchunkalloc + 1)); + + // Try if there are offset recorded in attributes, it can happen after opening a file + if (varp->isnew) { + varp->metaoff = -1; + ; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + + /* Select compression driver based on attribute */ + ret = ncchkp->driver->inq_att (ncchkp->ncp, varp->varid, "_filter", NULL, &len); + if (ret == NC_NOERR && len == 1) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_filter", + &(varp->filter), MPI_INT); + if (ret != NC_NOERR) { return err; } + } else { + varp->filter = ncchkp->default_filter; + } + switch (varp->filter) { + case NC_CHK_FILTER_NONE: + varp->filter_driver = NULL; + break; + case NC_CHK_FILTER_DUMMY: + varp->filter_driver = ncchk_dummy_inq_driver (); + break; +#ifdef ENABLE_ZLIB + case NC_CHK_FILTER_ZLIB: + varp->filter_driver = ncchk_zlib_inq_driver (); + break; +#endif +#ifdef ENABLE_SZ + case NC_CHK_FILTER_SZ: + varp->filter_driver = ncchk_sz_inq_driver (); + break; +#endif + default: + if (ncchkp->rank == 0) { + printf ("Warning: Unknown filter driver id %d, use NC_CHK_FILTER_DUMMY\n", + varp->filter); + } + varp->filter_driver = ncchk_dummy_inq_driver (); + break; + break; + } + + // Update max ndim and chunksize + if (ncchkp->max_ndim < varp->ndim) { ncchkp->max_ndim = varp->ndim; } + if (ncchkp->max_chunk_size < varp->chunksize) { + ncchkp->max_chunk_size = varp->chunksize; + } + + if (ncchkp->cache_limit_hint == -1) { + ncchkp->cache_limit += (size_t) (varp->nmychunkrec) * (size_t) (varp->chunksize); + } + } + } + +err_out:; + return err; +} + +int ncchkioi_var_init ( + NC_chk *ncchkp, NC_chk_var *varp, int nreq, MPI_Offset **starts, MPI_Offset **counts) { + int err=NC_NOERR; + + err = ncchkioi_var_init_core (ncchkp, varp, nreq, starts, counts); + CHK_ERR + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + err = ncchkioi_calc_chunk_owner (ncchkp, varp, nreq, starts, 
counts); + CHK_ERR + } + +err_out:; + return err; +} + +void ncchkioi_var_free (NC_chk_var *varp) { + + if (varp->chunkdim != NULL) { + NCI_Free (varp->dimsize); + NCI_Free (varp->chunkdim); + NCI_Free (varp->dimids); + NCI_Free (varp->nchunks); + NCI_Free (varp->cidsteps); + NCI_Free (varp->chunk_index); + NCI_Free (varp->chunk_owner); + NCI_Free (varp->dirty); + // for(i = 0; i < varp->nmychunk; i++){ + // if (varp->chunk_cache[varp->mychunks[i]] != NULL){ + // NCI_Free(varp->chunk_cache[varp->mychunks[i]]); + // } + //} + NCI_Free (varp->chunk_cache); + NCI_Free (varp->mychunks); + } +} + +int ncchkioi_init_nvar_core_gather (NC_chk *ncchkp, + int nvar, + NC_chk_var **varps, + int *rcnt, + int *roff, + MPI_Offset **starts, + MPI_Offset **counts) { + int err=NC_NOERR; + int i, j; + NC_chk_var *varp; + ncchkioi_chunk_overlap_t *ocnt[2], *ocnt_all[2]; + size_t ocnt_size[2]; + MPI_Status stat; + MPI_Request req; + + // Iinit vars + ocnt_size[0] = ocnt_size[1] = 0; + ocnt[0] = ocnt[1] = NULL; + for (i = 0; i < nvar; i++) { + varp = varps[i]; + j = i & 1; + + err = ncchkioi_var_init_core (ncchkp, varp, rcnt[i], starts + roff[i], counts + roff[i]); + CHK_ERR + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + if (varp->nchunkrec > ocnt_size[j]) { + ocnt_size[j] = varp->nchunkrec; + NCI_Free (ocnt[j]); + ocnt[j] = (ncchkioi_chunk_overlap_t *)NCI_Malloc ( + sizeof (ncchkioi_chunk_overlap_t) * varp->nchunkrec * 2); + ocnt_all[j] = ocnt[j] + varp->nchunkrec; + } + + err = ncchkioi_calc_chunk_overlap (ncchkp, varp, rcnt[i], starts, counts, ocnt[j]); + CHK_ERR + } + + if ((i > 0) && (req != MPI_REQUEST_NULL)) { // Wait comm for prev var + err = MPI_Wait (&req, &stat); + ncchkioi_assign_chunk_owner (ncchkp, varps[i - 1], ocnt_all[(i - 1) & 1]); + ncchkioi_write_chunk_ocnt (ncchkp, varps[i - 1], ocnt[(i - 1) & 1], + sizeof (ncchkioi_chunk_overlap_t)); + } + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + err = ncchkioi_sync_ocnt_reduce (ncchkp, varp->nchunkrec, ocnt[j], 
ocnt_all[j], &req); + CHK_ERR + } else { + req = MPI_REQUEST_NULL; + } + } + // Last var + if (req != MPI_REQUEST_NULL) { + err = MPI_Wait (&req, &stat); + ncchkioi_assign_chunk_owner (ncchkp, varp, ocnt_all[(i - 1) & 1]); + ncchkioi_write_chunk_ocnt (ncchkp, varp, ocnt[(i - 1) & 1], + sizeof (ncchkioi_chunk_overlap_t)); + } + +err_out:; + NCI_Free (ocnt[0]); + NCI_Free (ocnt[1]); + return err; +} + +int ncchkioi_init_nvar_core_reduce (NC_chk *ncchkp, + int nvar, + NC_chk_var **varps, + int *rcnt, + int *roff, + MPI_Offset **starts, + MPI_Offset **counts) { + int err=NC_NOERR; + int i, j; + NC_chk_var *varp; + ncchkioi_chunk_overlap_t *ocnt[2], *ocnt_all[2]; + size_t ocnt_size[2]; + MPI_Status stat; + MPI_Request req; + + // Iinit vars + ocnt_size[0] = ocnt_size[1] = 0; + ocnt[0] = ocnt[1] = NULL; + for (i = 0; i < nvar; i++) { + varp = varps[i]; + j = i & 1; + + err = ncchkioi_var_init_core (ncchkp, varp, rcnt[i], starts + roff[i], counts + roff[i]); + CHK_ERR + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_COWN) + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + if (varp->nchunkrec > ocnt_size[j]) { + ocnt_size[j] = varp->nchunkrec; + NCI_Free (ocnt[j]); + ocnt[j] = (ncchkioi_chunk_overlap_t *)NCI_Malloc ( + sizeof (ncchkioi_chunk_overlap_t) * varp->nchunkrec * 2); + ocnt_all[j] = ocnt[j] + varp->nchunkrec; + } + + err = ncchkioi_calc_chunk_overlap (ncchkp, varp, rcnt[i], starts + roff[i], + counts + roff[i], ocnt[j]); + CHK_ERR + } + + if ((i > 0) && (req != MPI_REQUEST_NULL)) { // Wait comm for prev var + err = MPI_Wait (&req, &stat); + ncchkioi_assign_chunk_owner (ncchkp, varps[i - 1], ocnt_all[(i - 1) & 1]); + ncchkioi_write_chunk_ocnt (ncchkp, varps[i - 1], ocnt[(i - 1) & 1], + sizeof (ncchkioi_chunk_overlap_t)); + } + + if (varp->varkind == NC_CHK_VAR_COMPRESSED) { + ncchkioi_sync_ocnt_reduce (ncchkp, varp->nchunkrec, ocnt[j], ocnt_all[j], &req); + } else { + req = MPI_REQUEST_NULL; + } + + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_VAR_INIT_COWN, 
NC_CHK_TIMER_VAR_INIT_META) + } + // Last var + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_COWN) + if (req != MPI_REQUEST_NULL) { + err = MPI_Wait (&req, &stat); + ncchkioi_assign_chunk_owner (ncchkp, varp, ocnt_all[(i - 1) & 1]); + ncchkioi_write_chunk_ocnt (ncchkp, varp, ocnt[(i - 1) & 1], + sizeof (ncchkioi_chunk_overlap_t)); + } + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_VAR_INIT_COWN, NC_CHK_TIMER_VAR_INIT_META) + +err_out:; + NCI_Free (ocnt[0]); + NCI_Free (ocnt[1]); + return err; +} + +int ncchkioi_init_nvar (NC_chk *ncchkp, int nput, int *putreqs, int nget, int *getreqs) { + int err = NC_NOERR, ret; + int i, j; + int nflag; + unsigned int *flag, *flag_all; + int nvar; + int *vmap; + NC_chk_var *varp; + NC_chk_var **varps; + int *rcnt, *roff; + MPI_Offset **starts, **counts; + NC_chk_req *req; + int nread; + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Datatype ftype, mtype; + MPI_Status status; + + NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META) + + CHK_ERR_ALLREDUCE (MPI_IN_PLACE, &(ncchkp->recsize), 1, MPI_LONG_LONG, MPI_MAX, + ncchkp->comm); // Sync number of recs + + // Flag of touched vars + nflag = ncchkp->vars.cnt / 32 + 1; + flag = (unsigned int *)NCI_Malloc (sizeof (int) * nflag * 2); + CHK_PTR (flag) + flag_all = flag + nflag; + memset (flag, 0, sizeof (int) * nflag); + for (i = 0; i < nput; i++) { + req = ncchkp->putlist.reqs + putreqs[i]; + flag[req->varid >> 5] |= 1u << (req->varid % 32); + } + for (i = 0; i < nget; i++) { + req = ncchkp->getlist.reqs + getreqs[i]; + flag[req->varid >> 5] |= 1u << (req->varid % 32); + } + + // Sync flag + CHK_ERR_ALLREDUCE (flag, flag_all, nflag, MPI_UNSIGNED, MPI_BOR, ncchkp->comm); + + // Build a skip list of touched vars + nvar = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (flag_all[i >> 5] & (1u << (i % 32))) { + if ((ncchkp->vars.data + i)->chunkdim == NULL) { // If not yet inited + nvar++; + } else { + flag_all[i >> 5] ^= (1u << (i % 32)); + if ((ncchkp->vars.data + i)->dimsize[0] < ncchkp->recsize) { + 
ncchkioi_var_resize (ncchkp, ncchkp->vars.data + i); + } + } + } + } + varps = (NC_chk_var **)NCI_Malloc (sizeof (NC_chk_var *) * nvar); + CHK_PTR (varps) + vmap = (int *)NCI_Malloc (sizeof (int) * ncchkp->vars.cnt); + CHK_PTR (vmap) + nvar = 0; + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (flag_all[i >> 5] & (1u << (i % 32))) { + varps[nvar] = ncchkp->vars.data + i; + vmap[i] = nvar++; + } + } + + // Count reqs for each var + roff = (int *)NCI_Malloc (sizeof (int) * (nvar + 1)); + CHK_PTR (roff) + rcnt = (int *)NCI_Malloc (sizeof (int) * nvar); + CHK_PTR (rcnt) + memset (rcnt, 0, sizeof (int) * nvar); + for (i = 0; i < nput; i++) { + req = ncchkp->putlist.reqs + putreqs[i]; + j = req->varid; + if (flag_all[j >> 5] & (1u << (j % 32))) { rcnt[vmap[j]] += req->nreq; } + } + for (i = 0; i < nget; i++) { + req = ncchkp->getlist.reqs + getreqs[i]; + j = req->varid; + if (flag_all[j >> 5] & (1u << (j % 32))) { rcnt[vmap[j]] += req->nreq; } + } + roff[0] = 0; + for (i = 0; i < nvar; i++) { roff[i + 1] = roff[i] + rcnt[i]; } + + // Gather starts and counts + starts = (MPI_Offset **)NCI_Malloc (sizeof (MPI_Offset *) * roff[nvar] * 2); + CHK_PTR (starts) + counts = starts + roff[nvar]; + memset (rcnt, 0, sizeof (int) * nvar); + for (i = 0; i < nput; i++) { + req = ncchkp->putlist.reqs + putreqs[i]; + j = req->varid; + if (flag_all[j >> 5] & (1u << (j % 32))) { + j = vmap[req->varid]; + if (req->nreq > 1) { + memcpy (starts + roff[j] + rcnt[j], req->starts, sizeof (MPI_Offset *) * req->nreq); + memcpy (counts + roff[j] + rcnt[j], req->counts, sizeof (MPI_Offset *) * req->nreq); + rcnt[j] += req->nreq; + } else { + starts[roff[j] + rcnt[j]] = req->start; + counts[roff[j] + (rcnt[j]++)] = req->count; + } + } + } + for (i = 0; i < nget; i++) { + req = ncchkp->getlist.reqs + getreqs[i]; + j = req->varid; + if (flag_all[j >> 5] & (1u << (j % 32))) { + j = vmap[req->varid]; + if (req->nreq > 1) { + memcpy (starts + roff[j] + rcnt[j], req->starts, sizeof (MPI_Offset *) * 
req->nreq); + memcpy (counts + roff[j] + rcnt[j], req->counts, sizeof (MPI_Offset *) * req->nreq); + rcnt[j] += req->nreq; + } else { + starts[roff[j] + rcnt[j]] = req->start; + counts[roff[j] + (rcnt[j]++)] = req->count; + } + } + } + + // Buffer for index table type + lens = NCI_Malloc (sizeof (int) * nvar); + CHK_PTR (lens) + fdisps = NCI_Malloc (sizeof (MPI_Aint) * nvar * 2); + CHK_PTR (fdisps) + mdisps = fdisps + nvar; + nread = 0; + + // Iinit vars + ncchkp->cown_size = 0; // Reset owner penalty + err = ncchkioi_init_nvar_core_reduce (ncchkp, nvar, varps, rcnt, roff, starts, counts); + CHK_ERR + + // Read the index table for existing variables + // MPI Type to load the index table for existing variables + for (i = 0; i < nvar; i++) { + varp = varps[i]; + if (!(varp->isnew)) { + ret = ncchkp->driver->get_att (ncchkp->ncp, varp->varid, "_metaoffset", + &(varp->metaoff), MPI_LONG_LONG); + if (ret == NC_NOERR) { + lens[nread] = sizeof (NC_chk_chunk_index_entry) * (varp->nchunk); + fdisps[nread] = varp->metaoff; + mdisps[nread++] = (MPI_Aint) (varp->chunk_index); + } else { + varp->metaoff = -1; + memset (varp->chunk_index, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk + 1)); + } + } + } + if (nread) { + ncchkioi_sort_file_offset (nread, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nread, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nread, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + // Set file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, ((NC *)(ncchkp->ncp))->begin_var, + MPI_BYTE, ftype, "native", MPI_INFO_NULL); + + // Read data + CHK_ERR_READ_AT_ALL (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BOTTOM, 1, mtype, + &status); + + // Restore file view + CHK_ERR_SET_VIEW (((NC *)(ncchkp->ncp))->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL); + +#ifdef WORDS_BIGENDIAN // Switch back to little endian + for (i = 0; i < nvar; i++) { + 
ncchkioi_idx_in_swapn (varps[i]->chunk_index, varps[i]->nchunk + 1); + } +#endif + + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } + + NCI_Free (lens); + NCI_Free (fdisps); + + NCI_Free (flag); + NCI_Free (varps); + NCI_Free (vmap); + NCI_Free (roff); + NCI_Free (rcnt); + NCI_Free (starts); + +err_out:; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META) + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_var_rd.c b/src/drivers/ncchunkio/ncchkioi_var_rd.c new file mode 100644 index 0000000000..26aa47126c --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_var_rd.c @@ -0,0 +1,759 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. + * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkioi_load_var (NC_chk *ncchkp, NC_chk_var *varp, int nchunk, int *cids) { + int err=NC_NOERR; + int i; + int cid; + int get_size; + + int dsize; + MPI_Offset bsize; + + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Status status; + MPI_Datatype ftype, mtype; // Memory and file datatype + + char **zbufs; + + NC *ncp = (NC *)(ncchkp->ncp); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // -1 means all chunks + if (nchunk < 0) { + nchunk = varp->nmychunk; + cids = varp->mychunks; + } + + // Allocate buffer for I/O + lens = (int *)NCI_Malloc (sizeof (int) * nchunk); + fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * nchunk * 2); + mdisps = fdisps + nchunk; + zbufs = (char **)NCI_Malloc (sizeof (char *) * nchunk); + + /* 
Carry our coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed when + * count is 0 We use a dummy call inplace of type with 0 count + */ + if (nchunk > 0) { + // Create file type + bsize = 0; + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + // offset and length of compressed chunks + lens[i] = varp->chunk_index[cid].len; + fdisps[i] = (MPI_Aint) (varp->chunk_index[cid].off) + ncp->begin_var; + mdisps[i] = bsize; + // At the same time, we record the size of buffer we need + bsize += (MPI_Offset)lens[i]; + } + + // Allocate buffer for compressed data + zbufs[0] = (char *)NCI_Malloc (bsize); + for (i = 1; i < nchunk; i++) { + zbufs[i] = zbufs[i - 1] + varp->chunk_index[cids[i - 1]].len; + } + + ncchkioi_sort_file_offset (nchunk, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nchunk, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nchunk, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_INIT, NC_CHK_TIMER_GET_IO_RD) + + // Perform MPI-IO + // Set file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL); + // Write data + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, zbufs[0], 1, mtype, &status); + // Restore file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_RD, NC_CHK_TIMER_GET_IO_INIT) + +#ifdef _USE_MPI_GET_COUNT + MPI_Get_count (&status, MPI_BYTE, &get_size); +#else + MPI_Type_size (ftype, &get_size); +#endif + ncchkp->getsize += get_size; + + // Free type + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } else { + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_INIT, NC_CHK_TIMER_GET_IO_RD) + + // Follow coll I/O with dummy call + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, &i, 0, MPI_BYTE, 
&status); + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_RD, NC_CHK_TIMER_GET_IO_INIT) + } + + // Decompress each chunk + // Allocate chunk cache if not allocated + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + dsize = varp->chunksize; + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_DECOM) + varp->filter_driver->decompress (zbufs[i], lens[i], varp->chunk_cache[cid]->buf, &dsize, + varp->ndim, varp->chunkdim, varp->etype); + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_IO_DECOM, NC_CHK_TIMER_GET_IO_INIT) + + if (dsize != varp->chunksize) { printf ("Decompress Error\n"); } + } + varp->filter_driver->finalize (); + } else { + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_DECOM) + memcpy (varp->chunk_cache[cid]->buf, zbufs[i], lens[i]); + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_IO_DECOM, NC_CHK_TIMER_GET_IO_INIT) + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_INIT) + + // Free buffers + if (nchunk > 0) { NCI_Free (zbufs[0]); } + NCI_Free (zbufs); + + NCI_Free (lens); + NCI_Free (fdisps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO) + +err_out:; + return err; +} + +int ncchkioi_load_nvar (NC_chk *ncchkp, int nvar, int *varids, int *lo, int *hi) { + int err=NC_NOERR; + int i, j, k; + int cid; + int get_size; + + int nchunk; + + int dsize; + 
MPI_Offset bsize; + + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Status status; + MPI_Datatype ftype, mtype; // Memory and file datatype + + char **zbufs; + + NC *ncp = (NC *)(ncchkp->ncp); + NC_chk_var *varp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO) + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_INIT) + + // -1 means all chunks + nchunk = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + if (varp->chunk_cache[cid] == NULL && varp->chunk_index[cid].len > 0) { nchunk++; } + } + } + + // Allocate buffer for I/O + lens = (int *)NCI_Malloc (sizeof (int) * nchunk); + fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * nchunk * 2); + mdisps = fdisps + nchunk; + zbufs = (char **)NCI_Malloc (sizeof (char *) * nchunk); + + /* Carry our coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed when + * count is 0 We use a dummy call inplace of type with 0 count + */ + if (nchunk > 0) { + // Create file type + bsize = 0; + k = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // We only need to read when it is not in cache + if (varp->chunk_cache[cid] == NULL && varp->chunk_index[cid].len > 0) { + // offset and length of compressed chunks + lens[k] = varp->chunk_index[cid].len; + fdisps[k] = (MPI_Aint) (varp->chunk_index[cid].off + ncp->begin_var); + mdisps[k] = bsize; + // At the same time, we record the size of buffer we need + bsize += (MPI_Offset)lens[k++]; + } + } + } + + // Allocate buffer for compressed data + // We allocate it continuously so no mem type needed + zbufs[0] = (char *)NCI_Malloc (bsize); + for (j = 1; j < nchunk; j++) { zbufs[j] = zbufs[j - 1] + lens[j - 1]; } + + ncchkioi_sort_file_offset (k, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nchunk, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + 
MPI_Type_create_hindexed (nchunk, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_INIT, NC_CHK_TIMER_GET_IO_RD) + + // Perform MPI-IO + // Set file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL); + // Write data + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, zbufs[0], 1, mtype, &status); + // Restore file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + +#ifdef _USE_MPI_GET_COUNT + MPI_Get_count (&status, MPI_BYTE, &get_size); +#else + MPI_Type_size (ftype, &get_size); +#endif + ncchkp->getsize += get_size; + + // Free type + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_RD, NC_CHK_TIMER_GET_IO_CACHE) + + k = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + dsize = varp->chunksize; + + // Decompress each chunk + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = + ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + + // Perform decompression + if (varp->chunk_index[cid].len > 0) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_DECOM) + varp->filter_driver->decompress (zbufs[k], lens[k], varp->chunk_cache[cid]->buf, + &dsize, varp->ndim, varp->chunkdim, varp->etype); + if (dsize != varp->chunksize) { printf ("Decompress Error\n"); } + k++; + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_IO_DECOM, + NC_CHK_TIMER_GET_IO_CACHE) + } else { + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + varp->filter_driver->finalize (); + } else { + for (j = lo[i]; j < hi[i]; j++) { + cid = 
varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = + ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + + if (varp->chunk_index[cid].len > 0) { + NC_CHK_TIMER_START (NC_CHK_TIMER_GET_IO_DECOM) + memcpy (varp->chunk_cache[cid]->buf, zbufs[k], lens[k]); + k++; + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_GET_IO_DECOM, + NC_CHK_TIMER_GET_IO_CACHE) + } else { + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_CACHE) + } else { + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_INIT, NC_CHK_TIMER_GET_IO_CACHE) + + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_GET_IO_CACHE, NC_CHK_TIMER_GET_IO_RD) + + // Follow coll I/O with dummy call + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, &i, 0, MPI_BYTE, &status); + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO_RD) + } + + // Free buffers + if (nchunk > 0) { NCI_Free (zbufs[0]); } + NCI_Free (zbufs); + + NCI_Free (lens); + NCI_Free (fdisps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_IO) + +err_out:; + return err; +} + +int ncchkioi_load_var_bg (NC_chk *ncchkp, NC_chk_var *varp, int 
nchunk, int *cids) { + int err=NC_NOERR; + int i; + int cid; + int get_size; + + int dsize; + MPI_Offset bsize; + + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Status status; + MPI_Datatype ftype, mtype; // Memory and file datatype + + char **zbufs; + + NC *ncp = (NC *)(ncchkp->ncp); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_INIT) + + // -1 means all chunks + if (nchunk < 0) { + nchunk = varp->nmychunk; + cids = varp->mychunks; + } + + // Allocate buffer for I/O + lens = (int *)NCI_Malloc (sizeof (int) * nchunk); + fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * nchunk * 2); + mdisps = fdisps + nchunk; + zbufs = (char **)NCI_Malloc (sizeof (char *) * nchunk); + + /* Carry our coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed when + * count is 0 We use a dummy call inplace of type with 0 count + */ + if (nchunk > 0) { + // Create file type + bsize = 0; + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + // offset and length of compressed chunks + lens[i] = varp->chunk_index[cid].len; + fdisps[i] = (MPI_Aint) (varp->chunk_index[cid].off) + ncp->begin_var; + mdisps[i] = bsize; + // At the same time, we record the size of buffer we need + bsize += (MPI_Offset)lens[i]; + } + + // Allocate buffer for compressed data + zbufs[0] = (char *)NCI_Malloc (bsize); + for (i = 1; i < nchunk; i++) { + zbufs[i] = zbufs[i - 1] + varp->chunk_index[cids[i - 1]].len; + } + + ncchkioi_sort_file_offset (nchunk, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nchunk, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nchunk, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_RD) + + // Perform MPI-IO + // Set file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL); + // Write data + 
CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, zbufs[0], 1, mtype, &status); + // Restore file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_RD) + +#ifdef _USE_MPI_PUT_COUNT + MPI_Get_count (&status, MPI_BYTE, &get_size); +#else + MPI_Type_size (ftype, &get_size); +#endif + ncchkp->getsize += get_size; + + // Free type + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } else { + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_RD) + + // Follow coll I/O with dummy call + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, &i, 0, MPI_BYTE, &status); + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_RD) + } + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_DECOM) + + // Decompress each chunk + // Allocate chunk cache if not allocated + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + dsize = varp->chunksize; + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + + varp->filter_driver->decompress (zbufs[i], lens[i], varp->chunk_cache[cid]->buf, &dsize, + varp->ndim, varp->chunkdim, varp->etype); + + if (dsize != varp->chunksize) { printf ("Decompress Error\n"); } + } + varp->filter_driver->finalize (); + } else { + for (i = 0; i < nchunk; i++) { + cid = cids[i]; + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + } else { + ncchkioi_cache_visit 
(ncchkp, varp->chunk_cache[cid]); + } + + memcpy (varp->chunk_cache[cid]->buf, zbufs[i], lens[i]); + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_DECOM) + + // Free buffers + if (nchunk > 0) { NCI_Free (zbufs[0]); } + NCI_Free (zbufs); + + NCI_Free (lens); + NCI_Free (fdisps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG) + +err_out:; + return err; +} + +int ncchkioi_load_nvar_bg (NC_chk *ncchkp, int nvar, int *varids, int *lo, int *hi) { + int err=NC_NOERR; + int i, j, k; + int cid; + int get_size; + + int nchunk; + + int dsize; + MPI_Offset bsize; + + int *lens; + MPI_Aint *fdisps, *mdisps; + MPI_Status status; + MPI_Datatype ftype, mtype; // Memory and file datatype + + char **zbufs; + + NC *ncp = (NC *)(ncchkp->ncp); + NC_chk_var *varp; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_INIT) + + // -1 means all chunks + nchunk = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + if (varp->chunk_cache[cid] == NULL && varp->chunk_index[cid].len > 0) { nchunk++; } + } + } + + // Allocate buffer for I/O + lens = (int *)NCI_Malloc (sizeof (int) * nchunk); + fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * nchunk * 2); + mdisps = fdisps + nchunk; + zbufs = (char **)NCI_Malloc (sizeof (char *) * nchunk); + + /* Carry our coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed when + * count is 0 We use a dummy call inplace of type with 0 count + */ + if (nchunk > 0) { + // Create file type + bsize = 0; + k = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // We only need to read when it is not in cache + if (varp->chunk_cache[cid] == NULL && varp->chunk_index[cid].len > 0) { + // offset and length of compressed chunks + lens[k] = varp->chunk_index[cid].len; + fdisps[k] = (MPI_Aint) 
(varp->chunk_index[cid].off + ncp->begin_var); + mdisps[k] = bsize; + // At the same time, we record the size of buffer we need + bsize += (MPI_Offset)lens[k++]; + } + } + } + + // Allocate buffer for compressed data + // We allocate it continuously so no mem type needed + zbufs[0] = (char *)NCI_Malloc (bsize); + for (j = 1; j < nchunk; j++) { zbufs[j] = zbufs[j - 1] + lens[j - 1]; } + + ncchkioi_sort_file_offset (k, fdisps, mdisps, lens); + + MPI_Type_create_hindexed (nchunk, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + MPI_Type_create_hindexed (nchunk, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_INIT) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_RD) + + // Perform MPI-IO + // Set file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL); + // Write data + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, zbufs[0], 1, mtype, &status); + // Restore file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + +#ifdef _USE_MPI_PUT_COUNT + MPI_Get_count (&status, MPI_BYTE, &get_size); +#else + MPI_Type_size (ftype, &get_size); +#endif + ncchkp->getsize += get_size; + + // Free type + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_BG_RD, NC_CHK_TIMER_PUT_BG_CACHE) + + k = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + dsize = varp->chunksize; + + // Decompress each chunk + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = + ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + + // Perform decompression + if (varp->chunk_index[cid].len > 0) { + NC_CHK_TIMER_START 
(NC_CHK_TIMER_PUT_BG_DECOM) + varp->filter_driver->decompress (zbufs[k], lens[k], varp->chunk_cache[cid]->buf, + &dsize, varp->ndim, varp->chunkdim, varp->etype); + if (dsize != varp->chunksize) { printf ("Decompress Error\n"); } + k++; + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_PUT_BG_DECOM, + NC_CHK_TIMER_PUT_BG_CACHE) + } else { + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } + } else { + // Cache is always up to date, no need to read and decompress + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + varp->filter_driver->finalize (); + } else { + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = + ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + + if (varp->chunk_index[cid].len > 0) { + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_BG_DECOM) + memcpy (varp->chunk_cache[cid]->buf, zbufs[k], lens[k]); + k++; + NC_CHK_TIMER_STOPEX (NC_CHK_TIMER_PUT_BG_DECOM, + NC_CHK_TIMER_PUT_BG_CACHE) + } else { + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } + } else { + // Cache is always up to date, no need to read and decompress + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_CACHE) + } else { + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_BG_INIT, NC_CHK_TIMER_PUT_BG_CACHE) + + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + + for (j = lo[i]; j < hi[i]; j++) { + cid = varp->mychunks[j]; + + // Allocate chunk cache if not allocated + if (varp->chunk_cache[cid] == NULL) { + err = ncchkioi_cache_alloc (ncchkp, varp->chunksize, varp->chunk_cache + cid); + CHK_ERR + // varp->chunk_cache[cid] = (char*)NCI_Malloc(varp->chunksize); + memset (varp->chunk_cache[cid]->buf, 0, varp->chunksize); + } else { + ncchkioi_cache_visit (ncchkp, varp->chunk_cache[cid]); + } + } + } 
+ + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_BG_CACHE, NC_CHK_TIMER_PUT_BG_RD) + + // Follow coll I/O with dummy call + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + CHK_ERR_READ_AT_ALL (ncp->collective_fh, 0, &i, 0, MPI_BYTE, &status); + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG_RD) + } + + // Free buffers + if (nchunk > 0) { NCI_Free (zbufs[0]); } + NCI_Free (zbufs); + + NCI_Free (lens); + NCI_Free (fdisps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_BG) + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_var_resize.c b/src/drivers/ncchunkio/ncchkioi_var_resize.c new file mode 100644 index 0000000000..cf8d98f244 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_var_resize.c @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkioi_var_resize (NC_chk *ncchkp, NC_chk_var *varp) { + int err=NC_NOERR; + int i; + int cid; + + if (varp->varkind == NC_CHK_VAR_COMPRESSED && varp->isrec) { + if (varp->dimsize[0] < ncchkp->recsize) { + int oldnchunk; + int oldnmychunk; + + // oldnrec = varp->nrec; + oldnchunk = varp->nchunk; + oldnmychunk = varp->nmychunk; + varp->nrec = varp->dimsize[0] = varp->nchunks[0] = ncchkp->recsize; + varp->nchunk = varp->nchunkrec * varp->nrec; + + // Extend metadata list if needed + if (varp->nrec > varp->nrecalloc) { + while (varp->nrecalloc < varp->nrec) { varp->nrecalloc *= NC_CHK_REC_MULTIPLIER; } + varp->nchunkalloc = varp->nrecalloc * varp->nchunkrec; + + varp->chunk_owner = + (int *)NCI_Realloc (varp->chunk_owner, sizeof (int) * varp->nchunkalloc); + varp->dirty = (int *)NCI_Realloc (varp->dirty, sizeof (int) * varp->nchunkalloc); + varp->chunk_cache = (NC_chk_cache **)NCI_Realloc ( + varp->chunk_cache, sizeof (char *) * varp->nchunkalloc); + for (i = 0; i < oldnmychunk; i++) { + cid = varp->mychunks[i]; + if (varp->chunk_cache[cid] != NULL) { + varp->chunk_cache[cid]->ref = varp->chunk_cache + cid; + } + } + + varp->chunk_index = (NC_chk_chunk_index_entry *)NCI_Realloc ( + varp->chunk_index, sizeof (NC_chk_chunk_index_entry) * (varp->nchunkalloc + 1)); + varp->mychunks = (int *)NCI_Realloc ( + varp->mychunks, sizeof (int) * varp->nrecalloc * varp->nmychunkrec); + + varp->expanded = 1; + } + memset (varp->chunk_index + oldnchunk, 0, + sizeof (NC_chk_chunk_index_entry) * (varp->nchunk - oldnchunk)); + memset (varp->dirty + oldnchunk, 0, sizeof (int) * (varp->nchunk 
- oldnchunk)); + memset (varp->chunk_cache + oldnchunk, 0, sizeof (char *) * (varp->nchunk - oldnchunk)); + + // Extend block ownership list + if (oldnchunk > 0) { + for (i = oldnchunk; i < varp->nchunk; i += varp->nchunkrec) { + // We reuse chunk mapping of other records + memcpy (varp->chunk_owner + i, varp->chunk_owner, + sizeof (int) * varp->nchunkrec); + } + varp->nmychunk = varp->nmychunkrec * varp->nrec; + for (i = oldnmychunk; i < varp->nmychunk; i += varp->nmychunkrec) { + // We reuse chunk mapping of other records + memcpy (varp->mychunks + i, varp->mychunks, sizeof (int) * varp->nmychunkrec); + } + } else { + err = ncchkioi_calc_chunk_owner (ncchkp, varp, 0, NULL, NULL); + CHK_ERR + + varp->nmychunkrec = 0; + for (i = 0; i < varp->nchunkrec; i++) { + if (varp->chunk_owner[i] == ncchkp->rank) { varp->nmychunkrec++; } + } + varp->mychunks = + (int *)NCI_Realloc (varp->mychunks, sizeof (int) * varp->nmychunkrec * varp->nrecalloc); + + if (ncchkp->cache_limit_hint == -1) { + ncchkp->cache_limit += + (size_t) (varp->nmychunkrec) * (size_t) (varp->chunksize); + } + } + + varp->nmychunk = oldnmychunk; + for (i = oldnchunk; i < varp->nchunk; i++) { + if (varp->chunk_owner[i] == ncchkp->rank) { + varp->mychunks[varp->nmychunk++] = i; + // varp->chunk_cache[i] = (void*)NCI_Malloc(varp->chunksize); // Allocate + // buffer for blocks we own memset(varp->chunk_cache[i], 0 , varp->chunksize); + } + } + + // Update global chunk count + ncchkp->nmychunks += (MPI_Offset) (varp->nmychunk - oldnmychunk); + } + } else { + // Notify ncmpio driver + } + +err_out:; + return err; +} + +int ncchkioi_resize_nvar (NC_chk *ncchkp, int nput, int *putreqs, int nget, int *getreqs) { + int err=NC_NOERR; + int i; + int nflag; + unsigned int *flag = NULL, *flag_all; + NC_chk_req *req; + + CHK_ERR_ALLREDUCE (MPI_IN_PLACE, &(ncchkp->recsize), 1, MPI_LONG_LONG, MPI_MAX, + ncchkp->comm); // Sync number of recs + + // Flag of touched vars + nflag = ncchkp->vars.cnt / 32 + 1; + flag = 
(unsigned int *)NCI_Malloc (sizeof (int) * nflag * 2); + CHK_PTR (flag) + flag_all = flag + nflag; + memset (flag, 0, sizeof (int) * nflag); + for (i = 0; i < nput; i++) { + req = ncchkp->putlist.reqs + putreqs[i]; + flag[req->varid >> 5] |= 1u << (req->varid % 32); + } + for (i = 0; i < nget; i++) { + req = ncchkp->getlist.reqs + getreqs[i]; + flag[req->varid >> 5] |= 1u << (req->varid % 32); + } + + // Sync flag + CHK_ERR_ALLREDUCE (flag, flag_all, nflag, MPI_UNSIGNED, MPI_BOR, ncchkp->comm); + + // Resize each var + for (i = 0; i < ncchkp->vars.cnt; i++) { + if (flag_all[i >> 5] & (1u << (i % 32))) { + flag_all[i >> 5] ^= (1u << (i % 32)); + if ((ncchkp->vars.data + i)->dimsize[0] < ncchkp->recsize) { + err = ncchkioi_var_resize (ncchkp, ncchkp->vars.data + i); + CHK_ERR + } + } + } + + NCI_Free (flag); + +err_out:; + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_var_wr.c b/src/drivers/ncchunkio/ncchkioi_var_wr.c new file mode 100644 index 0000000000..e0fa8e12a8 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_var_wr.c @@ -0,0 +1,636 @@ +/* + * Copyright (C) 2019, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +/* + * This file implements the following PnetCDF APIs. 
+ * + * ncmpi_get_var_all() : dispatcher->get_var() + * ncmpi_put_var_all() : dispatcher->put_var() + * ncmpi_get_var__all() : dispatcher->get_var() + * ncmpi_put_var__all() : dispatcher->put_var() + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ncmpio/ncmpio_NC.h" +#include "ncchkio_internal.h" + +int ncchkioi_save_var (NC_chk *ncchkp, NC_chk_var *varp) { + int i, k, l, err = NC_NOERR; + int *zsizes = NULL, *zsizes_all = NULL; + MPI_Datatype mtype, ftype; // Memory and file datatype + int wcnt; + int *lens = NULL; + MPI_Aint *disps = NULL; + MPI_Status status; + MPI_Offset *zoffs = NULL; + MPI_Offset voff; + void **zbufs = NULL; + int zdimid, zvarid; + int put_size; + char name[128]; // Name of objects + NC *ncp = (NC *)(ncchkp->ncp); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO) + + // Allocate buffer for compression + zsizes = (int *)NCI_Malloc (sizeof (int) * varp->nchunk); + CHK_PTR (zsizes) + zbufs = (void **)NCI_Malloc (sizeof (void *) * varp->nmychunk); + CHK_PTR (zbufs) + zsizes_all = (int *)NCI_Malloc (sizeof (int) * varp->nchunk); + CHK_PTR (zsizes_all) + zoffs = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * (varp->nchunk + 1)); + CHK_PTR (zoffs) + + // Allocate buffer for I/O + wcnt = 0; + for (l = 0; l < varp->nmychunk; l++) { + k = varp->mychunks[l]; + if (varp->dirty[k]) { wcnt++; } + } + if (ncchkp->rank == varp->chunk_owner[0]) { wcnt += 1; } + lens = (int *)NCI_Malloc (sizeof (int) * wcnt); + CHK_PTR (lens) + disps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * wcnt); + CHK_PTR (disps) + + memset (zsizes, 0, sizeof (int) * varp->nchunk); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_COM) + + // Compress each chunk we own + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + for (l = 0; l < varp->nmychunk; l++) { + k = varp->mychunks[l]; + + if (varp->dirty[k]) { + // Apply compression + err = 
varp->filter_driver->compress_alloc (varp->chunk_cache[k]->buf, varp->chunksize, + zbufs + l, zsizes + k, varp->ndim, varp->chunkdim, + varp->etype); + CHK_ERR + } + } + varp->filter_driver->finalize (); + } else { + for (l = 0; l < varp->nmychunk; l++) { + k = varp->mychunks[l]; + if (varp->dirty[k]) { + zbufs[l] = varp->chunk_cache[k]->buf; + zsizes[k] = varp->chunksize; + } + } + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_COM) + +#ifdef PNETCDF_PROFILING + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_BARR) + MPI_Barrier (ncchkp->comm); + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_BARR) +#endif + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_SYNC) + + // Sync compressed data size with other processes + CHK_ERR_ALLREDUCE (zsizes, zsizes_all, varp->nchunk, MPI_INT, MPI_MAX, ncchkp->comm); + + if (varp->metaoff < 0 || varp->expanded) { + zoffs[0] = varp->nchunkalloc * sizeof (NC_chk_chunk_index_entry); + } else { + zoffs[0] = 0; + } + for (i = 0; i < varp->nchunk; i++) { zoffs[i + 1] = zoffs[i] + zsizes_all[i]; } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_SYNC) + + if (zoffs[varp->nchunk] > 0) { // No need to do I/O if no dirty chunk to write + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_INIT) + + /* Write comrpessed variable + * We start by defining data variable and writing metadata + * Then, we create buffer type and file type for data + * Finally MPI collective I/O is used for writing data + */ + + // Enter redefine mode + ncchkp->driver->redef (ncchkp->ncp); + + // Prepare data variable + + // Define dimension for data variable + sprintf (name, "_datablock_dim_%d", ncchkp->nwrite); + err = ncchkp->driver->def_dim (ncchkp->ncp, name, zoffs[varp->nchunk], &zdimid); + if (err != NC_NOERR) return err; + + // Define data variable + sprintf (name, "_datablock_%d", ncchkp->nwrite); + err = ncchkp->driver->def_var (ncchkp->ncp, name, NC_BYTE, 1, &zdimid, &(zvarid)); + if (err != NC_NOERR) return err; + + // Mark as data variable + i = NC_CHK_VAR_DATA; + err = ncchkp->driver->put_att 
(ncchkp->ncp, zvarid, "_varkind", NC_INT, 1, &i, MPI_INT); + if (err != NC_NOERR) return err; + + // Record serial + ncchkp->nwrite++; + err = ncchkp->driver->put_att (ncchkp->ncp, NC_GLOBAL, "_nwrite", NC_INT, 1, + &(ncchkp->nwrite), MPI_INT); + if (err != NC_NOERR) return err; + + // Metadata offset + // Real metadata offset is only known after enddef + // We reserve the space so we don't need to enter define mode again + if (varp->metaoff < 0) { + err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_metaoffset", NC_INT64, 1, + &(varp->metaoff), MPI_LONG_LONG); + if (err != NC_NOERR) return err; + } + + // Switch to data mode + err = ncchkp->driver->enddef (ncchkp->ncp); + if (err != NC_NOERR) return err; + + // Update metadata + voff = ncp->vars.value[zvarid]->begin; + for (i = 0; i < varp->nchunk; i++) { + if (zsizes_all[i] > 0) { + varp->chunk_index[i].len = zsizes_all[i]; + varp->chunk_index[i].off = zoffs[i] + voff - ncp->begin_var; + } + } + + if (varp->metaoff < 0 || varp->expanded) { + varp->metaoff = voff - ncp->begin_var; + err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_metaoffset", NC_INT64, 1, + &(varp->metaoff), MPI_LONG_LONG); + if (err != NC_NOERR) return err; + + // unset expand flag + varp->expanded = 0; + } + + /* Carry out coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed + * when count is 0 We use a dummy call inplace of type with 0 count + */ + if (wcnt > 0) { + // Create file type + l = 0; + if (ncchkp->rank == varp->chunk_owner[0]) { // First chunk owner writes metadata + lens[l] = (varp->nchunk) * sizeof (NC_chk_chunk_index_entry); + disps[l++] = (MPI_Aint)varp->metaoff + ncp->begin_var; + } + for (i = 0; i < varp->nmychunk; i++) { + k = varp->mychunks[i]; + + // Record compressed size + if (varp->dirty[k]) { + lens[l] = zsizes[k]; + disps[l++] = (MPI_Aint) (varp->chunk_index[k].off) + ncp->begin_var; + } + } + MPI_Type_create_hindexed (wcnt, lens, disps, MPI_BYTE, 
&ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + // Create memory buffer type + l = 0; + if (ncchkp->rank == varp->chunk_owner[0]) { // First chunk owner writes metadata + lens[l] = (varp->nchunk) * sizeof (NC_chk_chunk_index_entry); + disps[l++] = (MPI_Aint)varp->chunk_index; + } + for (i = 0; i < varp->nmychunk; i++) { + k = varp->mychunks[i]; + + // Record compressed size + if (varp->dirty[k]) { + lens[l] = zsizes[k]; + disps[l++] = (MPI_Aint)zbufs[i]; + } + } + err = MPI_Type_create_hindexed (wcnt, lens, disps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_IO_INIT, NC_CHK_TIMER_PUT_IO_WR) + +#ifdef WORDS_BIGENDIAN // NetCDF data is big endian + if (ncchkp->rank == varp->chunk_owner[0]) { + ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk); + } +#endif + + // Perform MPI-IO + // Set file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL); + // Write data + CHK_ERR_WRITE_AT_ALL (ncp->collective_fh, 0, MPI_BOTTOM, 1, mtype, &status); + // Restore file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + +#ifdef WORDS_BIGENDIAN // Switch back to little endian + if (ncchkp->rank == varp->chunk_owner[0]) { + ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk); + } +#endif + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_WR) + +#ifdef _USE_MPI_GET_COUNT + MPI_Get_count (&status, MPI_BYTE, &put_size); +#else + MPI_Type_size (mtype, &put_size); +#endif + ncchkp->putsize += put_size; + + // Free type + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } else { + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_IO_INIT, NC_CHK_TIMER_PUT_IO_WR) + + // Follow coll I/O with dummy call + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + CHK_ERR_WRITE_AT_ALL (ncp->collective_fh, 0, MPI_BOTTOM, 0, MPI_BYTE, &status); + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + +
NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_WR) + } + } + +err_out:; + // Free buffers + NCI_Free (zsizes); + NCI_Free (zsizes_all); + NCI_Free (zoffs); + for (l = 0; l < varp->nmychunk; l++) { + k = varp->mychunks[l]; + if (varp->dirty[k]) { + if (varp->filter_driver != NULL) { free (zbufs[l]); } + // Clear dirty flag + varp->dirty[k] = 0; + } + } + NCI_Free (zbufs); + + NCI_Free (lens); + NCI_Free (disps); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO) + + return err; +} + +int ncchkioi_save_nvar (NC_chk *ncchkp, int nvar, int *varids) { + int i, k, l, err = NC_NOERR; + int vid; // Iterator for variable id + int cid; // Iterator for chunk id + int total_nchunks = 0; + int *zsizes = NULL, *zsizes_all = NULL, *zsizesp = NULL, *zsizes_allp = NULL; + MPI_Offset *zoffs = NULL, *zoffsp; + MPI_Offset voff; + MPI_Datatype mtype, ftype; // Memory and file datatype + int wcnt, ccnt, wcur, ccur; + int *lens = NULL; + MPI_Aint *mdisps = NULL, *fdisps = NULL; + MPI_Status status; + MPI_Request *reqs = NULL; + int put_size; + void **zbufs = NULL; + int *zdels = NULL; + int zdimid, zvarid; + char name[128]; // Name of objects + NC_chk_var *varp; + NC *ncp = (NC *)(ncchkp->ncp); + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO) + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_INIT) + + wcnt = 0; + ccnt = 0; + for (i = 0; i < nvar; i++) { + varp = ncchkp->vars.data + varids[i]; + if (ncchkp->rank == varp->chunk_owner[0]) { wcnt += 1; } + for (l = 0; l < varp->nmychunk; l++) { + k = varp->mychunks[l]; + if (varp->dirty[k]) { ccnt++; } + } + total_nchunks += varp->nchunk + 1; + } + wcnt += ccnt; + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_INIT) + + // Allocate reqid for metadata + reqs = (MPI_Request *)NCI_Malloc (sizeof (MPI_Request) * nvar); + CHK_PTR (reqs) + + // Allocate buffer for compression + zsizes = (int *)NCI_Malloc (sizeof (int) * total_nchunks); + CHK_PTR (zsizes) + zsizes_all = (int *)NCI_Malloc (sizeof (int) * total_nchunks); + CHK_PTR (zsizes_all) + zbufs = (void **)NCI_Malloc (sizeof 
(void *) * ccnt); + CHK_PTR (zbufs) + zdels = (int *)NCI_Malloc (sizeof (int) * ccnt); + CHK_PTR (zdels) + zoffs = (MPI_Offset *)NCI_Malloc (sizeof (MPI_Offset) * (total_nchunks + 1)); + CHK_PTR (zoffs) + + // Allocate buffer file type + mdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * wcnt); + CHK_PTR (mdisps) + lens = (int *)NCI_Malloc (sizeof (int) * wcnt); + CHK_PTR (lens) + fdisps = (MPI_Aint *)NCI_Malloc (sizeof (MPI_Aint) * wcnt); + CHK_PTR (fdisps) + + ccur = 0; + zsizesp = zsizes + nvar; + zsizes_allp = zsizes_all + nvar; + for (vid = 0; vid < nvar; vid++) { + varp = ncchkp->vars.data + varids[vid]; + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_COM) + + // oldzoff = zoffs[varp->nchunk]; + + memset (zsizesp, 0, sizeof (int) * varp->nchunk); + + // Compress each chunk we own + if (varp->filter_driver != NULL) { + varp->filter_driver->init (MPI_INFO_NULL); + for (l = 0; l < varp->nmychunk; l++) { + cid = varp->mychunks[l]; + + // Apply compression + if (varp->dirty[cid]) { + zdels[ccur] = 1; + err = varp->filter_driver->compress_alloc (varp->chunk_cache[cid]->buf, varp->chunksize, + zbufs + (ccur++), zsizesp + cid, varp->ndim, + varp->chunkdim, varp->etype); + CHK_ERR + } + } + varp->filter_driver->finalize (); + } else { + for (l = 0; l < varp->nmychunk; l++) { + cid = varp->mychunks[l]; + if (varp->dirty[cid]) { + zsizesp[cid] = varp->chunksize; + zdels[ccur] = 0; + zbufs[ccur++] = varp->chunk_cache[cid]->buf; + } + } + } + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_IO_COM, NC_CHK_TIMER_PUT_IO_SYNC) + + // Sync compressed data size with other processes + CHK_ERR_IALLREDUCE (zsizesp, zsizes_allp, varp->nchunk, MPI_INT, MPI_MAX, ncchkp->comm, + reqs + vid); + + if (varp->metaoff < 0 || varp->expanded) { + zsizes_all[vid] = varp->nchunkalloc * sizeof (NC_chk_chunk_index_entry); + } else { + zsizes_all[vid] = 0; + } + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_SYNC) + + zsizesp += varp->nchunk; + zsizes_allp += varp->nchunk; + } + +#ifdef PNETCDF_PROFILING + 
NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_BARR) + MPI_Barrier (ncchkp->comm); + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_BARR) +#endif + + /* Write comrpessed variable + * We start by defining data variable and writing metadata + * Then, we create buffer type and file type for data + * Finally MPI collective I/O is used for writing data + */ + + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_SYNC) + zsizes_allp = zsizes_all + nvar; + for (vid = 0; vid < nvar; vid++) { + varp = ncchkp->vars.data + varids[vid]; + CHK_ERR_WAIT (reqs + vid, &status); + zsizes_allp += varp->nchunk; + } + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_SYNC) + + zoffs[0] = 0; + for (i = 0; i < total_nchunks; i++) { zoffs[i + 1] = zoffs[i] + zsizes_all[i]; } + + if (zoffs[total_nchunks] > 0) { // No need to do I/O if no dirty chunk to write + NC_CHK_TIMER_START (NC_CHK_TIMER_PUT_IO_INIT) + + // Prepare data variable + + // Enter redefine mode + ncchkp->driver->redef (ncchkp->ncp); + + // Define dimension for data variable + sprintf (name, "_datablock_dim_%d", ncchkp->nwrite); + err = ncchkp->driver->def_dim (ncchkp->ncp, name, zoffs[total_nchunks], &zdimid); + if (err != NC_NOERR) return err; + + // Define data variable + sprintf (name, "_datablock_%d", ncchkp->nwrite); + err = ncchkp->driver->def_var (ncchkp->ncp, name, NC_BYTE, 1, &zdimid, &zvarid); + if (err != NC_NOERR) return err; + + // Mark as data variable + i = NC_CHK_VAR_DATA; + err = ncchkp->driver->put_att (ncchkp->ncp, zvarid, "_varkind", NC_INT, 1, &i, MPI_INT); + if (err != NC_NOERR) return err; + + // Record serial + ncchkp->nwrite++; + err = ncchkp->driver->put_att (ncchkp->ncp, NC_GLOBAL, "_nwrite", NC_INT, 1, + &(ncchkp->nwrite), MPI_INT); + if (err != NC_NOERR) return err; + + // Metadata offset + for (vid = 0; vid < nvar; vid++) { + varp = ncchkp->vars.data + varids[vid]; + // Reserve space for _metaoffset + if (varp->metaoff < 0) { + err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_metaoffset", NC_INT64, 1, + &(varp->metaoff), 
MPI_LONG_LONG); + if (err != NC_NOERR) return err; + } + } + + // Switch back to data mode + err = ncchkp->driver->enddef (ncchkp->ncp); + if (err != NC_NOERR) return err; + + voff = ncp->vars.value[zvarid]->begin; + + wcur = ccur = 0; + for (vid = 0; vid < nvar; vid++) { + varp = ncchkp->vars.data + varids[vid]; + + if (varp->metaoff < 0 || varp->expanded) { + varp->metaoff = zoffs[vid] + voff - ncp->begin_var; + err = ncchkp->driver->put_att (ncchkp->ncp, varp->varid, "_metaoffset", NC_INT64, 1, + &(varp->metaoff), MPI_LONG_LONG); + if (err != NC_NOERR) return err; + + // unset expand flag + varp->expanded = 0; + } + + if (ncchkp->rank == varp->chunk_owner[0]) { // First chunk owner writes metadata + lens[wcur] = varp->nchunk * sizeof (NC_chk_chunk_index_entry); + fdisps[wcur] = (MPI_Aint)varp->metaoff + ncp->begin_var; + mdisps[wcur++] = (MPI_Aint) (varp->chunk_index); + + // lens[wcur] = varp->nchunk * sizeof(int); + // fdisps[wcur] = (MPI_Aint)(varp->metaoff + ncp->begin_var + sizeof(long long) * + // varp->nchunkalloc); mdisps[wcur++] = (MPI_Aint)(varp->data_lens); + } + } + + ncchkioi_sort_file_offset (wcur, fdisps, mdisps, lens); + + zsizes_allp = zsizes_all + nvar; + zoffsp = zoffs + nvar; + for (vid = 0; vid < nvar; vid++) { + varp = ncchkp->vars.data + varids[vid]; + + for (cid = 0; cid < varp->nchunk; cid++) { + if (zsizes_allp[cid] > 0) { + varp->chunk_index[cid].len = zsizes_allp[cid]; + varp->chunk_index[cid].off = zoffsp[cid] + voff - ncp->begin_var; + } + } + + /* Paramemter for file and memory type + * We do not know variable file offset until the end of define mode + * We will add the displacement later + */ + for (i = 0; i < varp->nmychunk; i++) { + cid = varp->mychunks[i]; + + // Record parameter + if (varp->dirty[cid]) { + lens[wcur] = varp->chunk_index[cid].len; + fdisps[wcur] = (MPI_Aint) (varp->chunk_index[cid].off) + ncp->begin_var; + mdisps[wcur++] = (MPI_Aint)zbufs[ccur++]; + } + } + + // Clear dirty flag + memset (varp->dirty, 0, 
varp->nchunk * sizeof (int)); + + zsizes_allp += varp->nchunk; + zoffsp += varp->nchunk; + } + + NC_CHK_TIMER_SWAP (NC_CHK_TIMER_PUT_IO_INIT, NC_CHK_TIMER_PUT_IO_WR) + + /* Carry out coll I/O + * OpenMPI will fail when set view or do I/O on type created with MPI_Type_create_hindexed + * when count is 0. We use a dummy call in place of a type with 0 count + */ + if (wcnt > 0) { + // Create file type + MPI_Type_create_hindexed (wcnt, lens, fdisps, MPI_BYTE, &ftype); + CHK_ERR_TYPE_COMMIT (&ftype); + + // Create memory type + MPI_Type_create_hindexed (wcnt, lens, mdisps, MPI_BYTE, &mtype); + CHK_ERR_TYPE_COMMIT (&mtype); + +#ifdef WORDS_BIGENDIAN // NetCDF data is big endian + for (vid = 0; vid < nvar; vid++) { + varp = ncchkp->vars.data + varids[vid]; + if (ncchkp->rank == varp->chunk_owner[0]) { + ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1); + } + } +#endif + + // Perform MPI-IO + // Set file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, ftype, "native", MPI_INFO_NULL); + // Write data + CHK_ERR_WRITE_AT_ALL (ncp->collective_fh, 0, MPI_BOTTOM, 1, mtype, &status); + // Restore file view + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + +#ifdef WORDS_BIGENDIAN // Switch back to little endian + for (vid = 0; vid < nvar; vid++) { + varp = ncchkp->vars.data + varids[vid]; + if (ncchkp->rank == varp->chunk_owner[0]) { + ncchkioi_idx_in_swapn (varp->chunk_index, varp->nchunk + 1); + } + } +#endif + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_WR) + +#ifdef _USE_MPI_GET_COUNT + MPI_Get_count (&status, MPI_BYTE, &put_size); +#else + MPI_Type_size (mtype, &put_size); +#endif + ncchkp->putsize += put_size; + + // Free type + MPI_Type_free (&ftype); + MPI_Type_free (&mtype); + } else { + // Follow coll I/O with dummy call + CHK_ERR_SET_VIEW (ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + CHK_ERR_WRITE_AT_ALL (ncp->collective_fh, 0, MPI_BOTTOM, 0, MPI_BYTE, &status); + CHK_ERR_SET_VIEW
(ncp->collective_fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO_WR) + } + } + +err_out:; + // Free buffers + NCI_Free (zsizes); + NCI_Free (zsizes_all); + NCI_Free (zoffs); + ccur = 0; + for (i = 0; i < ccnt; i++) { + if (zdels[i]) { free (zbufs[i]); } + } + NCI_Free (zbufs); + NCI_Free (zdels); + + NCI_Free (lens); + NCI_Free (fdisps); + NCI_Free (mdisps); + + NCI_Free (reqs); + + NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT_IO) + + return err; +} diff --git a/src/drivers/ncchunkio/ncchkioi_vector.c b/src/drivers/ncchunkio/ncchkioi_vector.c new file mode 100644 index 0000000000..adf519eb07 --- /dev/null +++ b/src/drivers/ncchunkio/ncchkioi_vector.c @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2018, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ +/* $Id$ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include "ncchkio_internal.h" + +#define STARTSIZE 32 +#define SIZEMUTIPLIER 20 + +/* Initialize v as an empty vector of esize-byte elements with STARTSIZE slots. */ +int ncchkioi_vector_init(NC_chk_vector *v, int esize){ + v->esize = esize; + v->nalloc = STARTSIZE; + v->size = 0; + v->data = (char*)NCI_Malloc(esize * v->nalloc); + if (v->data == NULL){ + DEBUG_RETURN_ERROR(NC_ENOMEM); + } + return NC_NOERR; +} + +/* Same as ncchkioi_vector_init, but with a caller-chosen initial capacity. */ +int ncchkioi_vector_init_ex(NC_chk_vector *v, int esize, int size){ + v->esize = esize; + v->nalloc = size; + v->size = 0; + v->data = (char*)NCI_Malloc(esize * v->nalloc); + if (v->data == NULL){ + DEBUG_RETURN_ERROR(NC_ENOMEM); + } + return NC_NOERR; +} + +/* Release the element storage owned by v. */ +void ncchkioi_vector_free(NC_chk_vector *v){ + NCI_Free(v->data); +} + +/* Append a copy of the esize bytes at item, growing the storage when it is full. */ +int ncchkioi_vector_append(NC_chk_vector *v, void *item){ + if (v->size == v->nalloc){ + /* A zero capacity would stay zero when multiplied, so restart from STARTSIZE */ + v->nalloc = (v->nalloc > 0) ? v->nalloc * SIZEMUTIPLIER : STARTSIZE; + v->data = (char*)NCI_Realloc(v->data, v->esize * v->nalloc); + if (v->data == NULL){ + DEBUG_RETURN_ERROR(NC_ENOMEM); + } + } + memcpy((char*)v->data + v->size * v->esize, item, v->esize); + v->size++; + return NC_NOERR; +} \ No newline at end of file diff --git a/src/drivers/ncchunkio/ncchkioi_wait.c
b/src/drivers/ncchunkio/ncchkioi_wait.c
new file mode 100644
index 0000000000..35be331ced
--- /dev/null
+++ b/src/drivers/ncchunkio/ncchkioi_wait.c
@@ -0,0 +1,320 @@
+/*
+ *  Copyright (C) 2019, Northwestern University and Argonne National Laboratory
+ *  See COPYRIGHT notice in top-level directory.
+ */
+/* $Id$ */
+
+/*
+ * This file implements the following PnetCDF APIs.
+ *
+ * ncmpi_get_var_all()        : dispatcher->get_var()
+ * ncmpi_put_var_all()        : dispatcher->put_var()
+ * ncmpi_get_var_<type>_all() : dispatcher->get_var()
+ * ncmpi_put_var_<type>_all() : dispatcher->put_var()
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ncchkio_internal.h"
+
+/* Our driver currently can handle only one variable at a time
+ * We pack all request as a large varn request
+ *
+ * Complete the nreq pending put requests listed in reqids (indices into
+ * ncchkp->putlist.reqs). A bitmap of the variables touched by these requests
+ * is combined over ncchkp->comm with MPI_BOR, the requests go through the
+ * collective buffering routine selected by ncchkp->comm_unit, and the touched
+ * variables are then handed to ncchkioi_save_nvar().
+ * stats is passed through to the collective buffering routine.
+ * Returns NC_NOERR on success, or the error code of the failing step.
+ */
+int ncchkioi_wait_put_reqs (NC_chk *ncchkp, int nreq, int *reqids, int *stats) {
+    int err=NC_NOERR;
+    int i;
+    int nvar, nflag;
+    unsigned int *flag, *flag_all;
+    int *vids = NULL;  // NULL so the free at err_out is safe if reached before allocation
+    NC_chk_req *req;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT_PUT)
+
+    // Flag of touched vars
+    nflag = ncchkp->vars.cnt / 32 + 1;
+    flag = (unsigned int *)NCI_Malloc (sizeof (int) * nflag * 2);
+    flag_all = flag + nflag;
+    memset (flag, 0, sizeof (int) * nflag);
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->putlist.reqs + reqids[i];
+        flag[req->varid >> 5] |= 1u << (req->varid % 32);
+    }
+
+    // Sync flag
+    CHK_ERR_ALLREDUCE (flag, flag_all, nflag, MPI_UNSIGNED, MPI_BOR, ncchkp->comm);
+
+    // Build a skip list of touched vars
+    nvar = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) { nvar++; }
+    }
+    vids = (int *)NCI_Malloc (sizeof (int) * nvar);
+    nvar = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) { vids[nvar++] = i; }
+    }
+
+    // Perform collective buffer
+    if (ncchkp->comm_unit == NC_CHK_COMM_CHUNK) {
+        err = ncchkioi_iput_cb_chunk (ncchkp, nreq, reqids, stats);
+    } else {
+        err = ncchkioi_iput_cb_proc (ncchkp, nreq, reqids, stats);
+    }
+    CHK_ERR
+
+#ifdef PNETCDF_PROFILING
+    NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT_PUT_BARR)
+    MPI_Barrier (ncchkp->comm);
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_WAIT_PUT_BARR)
+#endif
+
+    // Perform I/O for compressed variables
+    err = ncchkioi_save_nvar (ncchkp, nvar, vids);
+    CHK_ERR
+
+err_out:;
+
+    // Free buffers
+    NCI_Free (vids);
+    NCI_Free (flag);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_WAIT_PUT)
+
+    return err;
+}
+
+/* Our driver currently can handle only one variable at a time
+ * We pack all request as a large varn request
+ *
+ * Complete the nreq pending get requests listed in reqids (indices into
+ * ncchkp->getlist.reqs). The requests go through the collective buffering
+ * routine selected by ncchkp->comm_unit; afterwards every request whose buf
+ * differs from its xbuf is handed to ncchkioiconvert() (xbuf first, buf
+ * second).
+ * stats is passed through to the collective buffering routine.
+ * Returns NC_NOERR on success, or the error code of the failing step.
+ */
+int ncchkioi_wait_get_reqs (NC_chk *ncchkp, int nreq, int *reqids, int *stats) {
+    int err=NC_NOERR;
+    int i;
+    int nvar, nflag;
+    unsigned int *flag, *flag_all;
+    int *vids = NULL;  // NULL so the free at err_out is safe if reached before allocation
+    NC_chk_req *req;
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT_GET)
+
+    // Flag of touched vars
+    nflag = ncchkp->vars.cnt / 32 + 1;
+    flag = (unsigned int *)NCI_Malloc (sizeof (int) * nflag * 2);
+    flag_all = flag + nflag;
+    memset (flag, 0, sizeof (int) * nflag);
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->getlist.reqs + reqids[i];
+        flag[req->varid >> 5] |= 1u << (req->varid % 32);
+    }
+
+    // Sync flag
+    CHK_ERR_ALLREDUCE (flag, flag_all, nflag, MPI_UNSIGNED, MPI_BOR, ncchkp->comm);
+
+    // Build a skip list of touched vars
+    nvar = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) { nvar++; }
+    }
+    vids = (int *)NCI_Malloc (sizeof (int) * nvar);
+    nvar = 0;
+    for (i = 0; i < ncchkp->vars.cnt; i++) {
+        if (flag_all[i >> 5] & (1u << (i % 32))) { vids[nvar++] = i; }
+    }
+
+    // Perform I/O for compressed variables
+    // ncchkioi_load_nvar(ncchkp, nvar, vids);
+
+    // Perform collective buffer
+    if (ncchkp->comm_unit == NC_CHK_COMM_CHUNK) {
+        err = ncchkioi_iget_cb_chunk (ncchkp, nreq, reqids, stats);
+    } else {
+        err = ncchkioi_iget_cb_proc (ncchkp, nreq, reqids, stats);
+        // ncchkioi_iget_cb_chunk(ncchkp, nreq, reqids, stats);
+    }
+    CHK_ERR
+
+    NC_CHK_TIMER_START (NC_CHK_TIMER_GET_CONVERT)
+    for (i = 0; i < nreq; i++) {
+        req = ncchkp->getlist.reqs + reqids[i];
+        if (req->buf != req->xbuf) {
+            void *cbuf = (void *)req->buf;
+
+            err = ncchkioiconvert (req->xbuf, cbuf, ncchkp->vars.data[req->varid].etype,
+                                   req->buftype, req->bufcount);
+            CHK_ERR
+
+            // NOTE(review): cbuf was set to req->buf above, so this free looks unreachable
+            if (cbuf != req->buf) NCI_Free (cbuf);
+        }
+    }
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET_CONVERT)
+
+err_out:;
+
+    // Free buffers
+    NCI_Free (vids);
+    NCI_Free (flag);
+
+    NC_CHK_TIMER_STOP (NC_CHK_TIMER_WAIT_GET)
+
+    return err;
+}
+
+/* Wait for pending nonblocking requests.
+ *
+ * nreqs is either the number of request IDs in reqids, or one of NC_REQ_ALL,
+ * NC_PUT_REQ_ALL and NC_GET_REQ_ALL, in which case the IDs are taken from
+ * ncchkp->putlist and/or ncchkp->getlist. In reqids, an ID with bit 0 set is
+ * a put request; the remaining bits (ID >> 1) index the put or get list.
+ * When stats is not NULL, stats[i] receives the status of reqids[i].
+ * reqMode is not referenced by this function.
+ * Returns NC_NOERR on success, or the error code of the failing step.
+ */
+int ncchkioi_wait (NC_chk *ncchkp, int nreqs, int *reqids, int *stats, int reqMode) {
+    int err=NC_NOERR;
+    int i;
+    int nput = 0, nget = 0;
+    int iput, iget;
+    int *putreqs = NULL, *getreqs = NULL;
+    int *putstats = NULL, *getstats = NULL;
+
+    if (nreqs == NC_REQ_ALL || nreqs == NC_PUT_REQ_ALL) {
+        nput = ncchkp->putlist.nused;
+        putreqs = (int *)NCI_Malloc (sizeof (int) * nput);
+        CHK_PTR (putreqs)
+        memcpy (putreqs, ncchkp->putlist.ids, nput * sizeof (int));
+    }
+    if (nreqs == NC_REQ_ALL || nreqs == NC_GET_REQ_ALL) {
+        nget = ncchkp->getlist.nused;
+        getreqs = (int *)NCI_Malloc (sizeof (int) * nget);
+        CHK_PTR (getreqs)
+        memcpy (getreqs, ncchkp->getlist.ids, nget * sizeof (int));
+    }
+
+    if (nreqs > 0) {
+        // Count number of get and put requests
+        for (i = 0; i < nreqs; i++) {
+            if (reqids[i] & 1) { nput++; }
+        }
+
+        // Allocate buffer
+        nget = nreqs - nput;
+        putreqs = (int *)NCI_Malloc (sizeof (int) * nput);
+        CHK_PTR (putreqs)
+        getreqs = (int *)NCI_Malloc (sizeof (int) * nget);
+        CHK_PTR (getreqs)
+
+        // Build put and get req list
+        nput = nget = 0;
+        for (i = 0; i < nreqs; i++) {
+            if (reqids[i] & 1) {
+                putreqs[nput++] = reqids[i] >> 1;
+            } else {
+                getreqs[nget++] = reqids[i] >> 1;
+            }
+        }
+    }
+
+    if (ncchkp->delay_init) {
+        NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_WAIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_INIT_META)
+
+        err = ncchkioi_init_nvar (ncchkp, nput, putreqs, nget, getreqs);  // nput + nget = real nreq
+        // Leave through err_out so the request lists are released
+        if (err != NC_NOERR) { goto err_out; }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT)
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_INIT_META)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT)
+    } else {
+        NC_CHK_TIMER_PAUSE (NC_CHK_TIMER_WAIT)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_VAR_RESIZE)
+
+        // Sync number of rec
+        err =
+            ncchkioi_resize_nvar (ncchkp, nput, putreqs, nget, getreqs);  // nput + nget = real nreq
+        if (err != NC_NOERR) { goto err_out; }
+
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_VAR_RESIZE)
+        NC_CHK_TIMER_START (NC_CHK_TIMER_WAIT)
+    }
+
+    if (stats != NULL) {
+        putstats = (int *)NCI_Malloc (sizeof (int) * nput);
+        CHK_PTR (putstats)
+        getstats = (int *)NCI_Malloc (sizeof (int) * nget);
+        CHK_PTR (getstats)
+        memset (putstats, 0, sizeof (int) * nput);
+        memset (getstats, 0, sizeof (int) * nget);
+    } else {
+        putstats = NULL;
+        getstats = NULL;
+    }
+
+    if ((ncchkp->mode & NC_WRITE) && nreqs != NC_GET_REQ_ALL) {
+        NC_CHK_TIMER_START (NC_CHK_TIMER_PUT)
+        err = ncchkioi_wait_put_reqs (ncchkp, nput, putreqs, putstats);
+        CHK_ERR
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_PUT)
+    }
+
+    if (nreqs != NC_PUT_REQ_ALL) {
+        NC_CHK_TIMER_START (NC_CHK_TIMER_GET)
+        err = ncchkioi_wait_get_reqs (ncchkp, nget, getreqs, getstats);
+        CHK_ERR
+        NC_CHK_TIMER_STOP (NC_CHK_TIMER_GET)
+    }
+
+    // Assign stats
+    // Separate cursors keep nput/nget intact for the removal loops below
+    if (stats != NULL) {
+        iput = iget = 0;
+        for (i = 0; i < nreqs; i++) {
+            if (reqids[i] & 1) {
+                stats[i] = putstats[iput++];
+            } else {
+                stats[i] = getstats[iget++];
+            }
+        }
+    }
+
+    // Remove from req list
+    for (i = 0; i < nput; i++) { ncchkioi_req_list_remove (&(ncchkp->putlist), putreqs[i]); }
+    for (i = 0; i < nget; i++) { ncchkioi_req_list_remove (&(ncchkp->getlist), getreqs[i]); }
+
+err_out:;
+    // Free buffers (also reached on error; unallocated pointers are still NULL)
+    NCI_Free (putstats);
+    NCI_Free (getstats);
+    NCI_Free (putreqs);
+    NCI_Free (getreqs);
+
+    return err;
+}
diff --git a/src/drivers/ncfoo/ncfoo_driver.h b/src/drivers/ncfoo/ncfoo_driver.h
index b266ffa513..1381bdcb95 100644
--- a/src/drivers/ncfoo/ncfoo_driver.h
+++ b/src/drivers/ncfoo/ncfoo_driver.h
@@
-22,10 +22,12 @@ struct NC_foo { }; extern int -ncfoo_create(MPI_Comm comm, const char *path, int cmode, int ncid, MPI_Info info, void **ncdp); +ncfoo_create(MPI_Comm comm, const char *path, int cmode, int ncid, + int env_mode, MPI_Info info, void **ncdp); extern int -ncfoo_open(MPI_Comm comm, const char *path, int omode, int ncid, MPI_Info info, void **ncdp); +ncfoo_open(MPI_Comm comm, const char *path, int omode, int ncid, + int env_mode, MPI_Info info, void **ncdp); extern int ncfoo_close(void *ncdp); diff --git a/src/drivers/ncfoo/ncfoo_file.c b/src/drivers/ncfoo/ncfoo_file.c index d4cc2e22fc..874f9d5856 100644 --- a/src/drivers/ncfoo/ncfoo_file.c +++ b/src/drivers/ncfoo/ncfoo_file.c @@ -51,6 +51,7 @@ ncfoo_create(MPI_Comm comm, const char *path, int cmode, int ncid, + int env_mode, MPI_Info info, void **ncpp) /* OUT */ { @@ -63,7 +64,7 @@ ncfoo_create(MPI_Comm comm, driver = ncmpio_inq_driver(); if (driver == NULL) return NC_ENOTNC; - err = driver->create(comm, path, cmode, ncid, info, &ncp); + err = driver->create(comm, path, cmode, ncid, env_mode, info, &ncp); if (err != NC_NOERR) return err; /* Create a NC_foo object and save its driver pointer */ @@ -92,6 +93,7 @@ ncfoo_open(MPI_Comm comm, const char *path, int omode, int ncid, + int env_mode, MPI_Info info, void **ncpp) { @@ -110,7 +112,7 @@ ncfoo_open(MPI_Comm comm, } if (driver == NULL) return NC_ENOTNC; - err = driver->open(comm, path, omode, ncid, info, &ncp); + err = driver->open(comm, path, omode, ncid, env_mode, info, &ncp); if (err != NC_NOERR) return err; /* Create a NC_foo object and save its driver pointer */ diff --git a/src/drivers/ncmpio/Makefile.am b/src/drivers/ncmpio/Makefile.am index c1afe76c19..9cbd78e13e 100644 --- a/src/drivers/ncmpio/Makefile.am +++ b/src/drivers/ncmpio/Makefile.am @@ -12,6 +12,7 @@ AM_CPPFLAGS = -I${top_srcdir}/src/include AM_CPPFLAGS += -I${top_builddir}/src/include AM_CPPFLAGS += -I${top_srcdir}/src/drivers/include AM_CPPFLAGS += 
-I${top_builddir}/src/drivers/include +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/pncio if PNETCDF_DEBUG AM_CPPFLAGS += -DPNETCDF_DEBUG diff --git a/src/drivers/ncmpio/ncmpio_NC.h b/src/drivers/ncmpio/ncmpio_NC.h index 0e2c71e814..995a083ff1 100644 --- a/src/drivers/ncmpio/ncmpio_NC.h +++ b/src/drivers/ncmpio/ncmpio_NC.h @@ -16,14 +16,23 @@ #include #include "ncmpio_driver.h" +#include "pncio.h" +/* default free space in the file header section. */ #define NC_DEFAULT_H_MINFREE 0 -#define NC_DEFAULT_V_ALIGN 512 + +/* default free space in the fix-sized variable section. */ #define NC_DEFAULT_V_MINFREE 0 + +/* default alignment for the starting offset of record variable section. */ #define NC_DEFAULT_R_ALIGN 4 -#define FILE_ALIGNMENT_DEFAULT 512 -#define FILE_ALIGNMENT_LB 4 +/* The default file header extent size is aligned with FILE_ALIGNMENT_DEFAULT. + * This default will be overwritten by hint nc_header_align_size or + * nc_var_align_size. Note when both hints are set by users, hint + * nc_var_align_size supersedes nc_header_align_size. + */ +#define NC_DEFAULT_V_ALIGN 512 /* MPI_OFFSET datatype was introduced in MPI 2.2 */ #if MPI_VERSION < 3 @@ -50,13 +59,25 @@ /* ncmpi_create/ncmpi_open set up header to be 'chunksize' big and to grow * by 'chunksize' as new items added. This used to be 4k. 256k lets us read - * in an entire climate header in one go */ -#define PNC_DEFAULT_CHUNKSIZE 262144 + * in an entire climate header in one go. This default will be overwritten by + * hint nc_header_read_chunk_size. + */ +#define PNC_HDR_READ_CHUNK_SIZE 262144 + +/* When file header grows or variables need to be moved to higher file offsets, + * data movement is performed in chunks of size PNC_MOVE_CHUNK_SIZE each per + * process. If the number of chunks is larger than the number of processes, + * carry out the data movement in multiple rounds. This default will be + * overwritten by hint nc_data_move_chunk_size. 
+ */ +#define PNC_DATA_MOVE_CHUNK_SIZE 1048576 /* default size of temporal buffer to pack noncontiguous user buffers for MPI * collective read and write during ncmpi_wait/wait_all(). On some systems, * e.g. Cray KNL, using contiguous user buffers in collective I/O is much - * faster than noncontiguous. */ + * faster than noncontiguous. This default will be overwritten by hint + * nc_ibuf_size. + */ #define PNC_DEFAULT_IBUF_SIZE 16777216 /* when variable's nctype is NC_CHAR, I/O buffer's MPI type must be MPI_CHAR @@ -156,7 +177,7 @@ typedef struct { * specifications can be of type 8-byte integers. */ typedef struct NC_dimarray { - int ndefined; /* number of defined dimensions */ + int ndefined; /* no. defined dimensions */ int unlimited_id; /* -1 for not defined, otherwise >= 0 */ NC_dim **value; int hash_size; @@ -180,7 +201,7 @@ ncmpio_dup_NC_dimarray(NC_dimarray *ncap, const NC_dimarray *ref); * NC attribute */ typedef struct { - MPI_Offset nelems; /* number of attribute elements */ + MPI_Offset nelems; /* no. attribute elements */ MPI_Offset xsz; /* amount of space at xvalue (4-byte aligned) */ nc_type xtype; /* external NC data type of the attribute */ size_t name_len; /* strlen(name) for faster string compare */ @@ -199,7 +220,7 @@ typedef struct { * specifications can be of type 8-byte integers. */ typedef struct NC_attrarray { - int ndefined; /* number of defined attributes */ + int ndefined; /* no. defined attributes */ NC_attr **value; int hash_size; NC_nametable *nameT; @@ -238,7 +259,7 @@ typedef struct { int no_fill; /* whether fill mode is disabled */ size_t name_len;/* strlen(name) for faster string compare */ char *name; /* name of the variable */ - int ndims; /* number of dimensions */ + int ndims; /* no. 
dimensions */ int *dimids; /* [ndims] array of dimension IDs */ MPI_Offset *shape; /* [ndims] dim->size of each dim shape[0] == NC_UNLIMITED if record variable */ @@ -268,8 +289,8 @@ typedef struct { */ /* note: we only allow less than 2^31-1 variables defined in a file */ typedef struct NC_vararray { - int ndefined; /* number of defined variables */ - int num_rec_vars;/* number of defined record variables */ + int ndefined; /* no. defined variables */ + int num_rec_vars;/* no. defined record variables */ NC_var **value; int hash_size; NC_nametable *nameT; @@ -319,15 +340,15 @@ typedef struct NC_lead_req { int flag; /* bit-wise OR of the above NC_REQ_* flags */ int id; /* even number for write, odd for read */ int nonlead_off; /* start index in the non-lead queue */ - int nonlead_num; /* number of non-lead requests */ + int nonlead_num; /* no. non-lead requests */ int abuf_index; /* index in the abuf occupy_table. -1 means not using attached buffer */ void *buf; /* user buffer */ void *xbuf; /* buffer in external type, may be == buf */ NC_var *varp; /* pointer to NC variable object */ - MPI_Offset nelems; /* total number of array elements requested */ + MPI_Offset nelems; /* total no. array elements requested */ MPI_Offset max_rec; /* highest record requested */ - MPI_Offset bufcount; /* number of buftype in this request */ + MPI_Offset bufcount; /* no. buftype in this request */ MPI_Offset *start; /* [varp->ndims*3] for start/count/stride */ MPI_Datatype buftype; /* user defined derived data type */ MPI_Datatype itype; /* internal element data type in buftype */ @@ -338,10 +359,11 @@ typedef struct NC_lead_req { typedef struct NC_req { MPI_Offset offset_start; /* starting offset of aggregate access region */ MPI_Offset offset_end; /* ending offset of aggregate access region */ - MPI_Offset nelems; /* number of array elements requested */ + MPI_Offset nelems; /* no. 
array elements requested */ MPI_Offset *start; /* [varp->ndims*3] for start/count/stride */ void *xbuf; /* buffer in external type, used in file I/O calls */ int lead_off; /* start index in the lead queue */ + MPI_Aint npairs; /* no. flattened offset-length pairs */ } NC_req; #define NC_ABUF_DEFAULT_TABLE_SIZE 128 @@ -379,44 +401,47 @@ struct NC { int mpiomode; /* mode used in MPI_File_open, passed from * collective open to independent open */ int format; /* 1, 2, or 5 corresponding to CDF-1, 2, or 5 */ - int safe_mode; /* 0 or 1, for parameter consistency check */ #ifdef ENABLE_SUBFILING int subfile_mode; /* 0 or 1, for disable/enable subfiling */ - int num_subfiles; /* number of subfiles */ + int num_subfiles; /* no. subfiles */ struct NC *ncp_sf; /* ncp of subfile */ MPI_Comm comm_sf; /* subfile MPI communicator */ + PNCIO_node_ids node_ids_sf; /* node IDs of subfile MPI communicator */ #endif - int striping_unit; /* stripe size of the file */ - int chunk; /* chunk size for reading header, one chunk at a time */ - MPI_Offset v_align; /* alignment of the beginning of fixed-size variables */ - MPI_Offset r_align; /* file alignment for record variable section */ - MPI_Offset env_v_align; /* v_align set in environment variable */ - MPI_Offset env_r_align; /* r_align set in environment variable */ - MPI_Offset info_v_align;/* v_align set in MPI Info object */ - MPI_Offset info_r_align;/* r_align set in MPI Info object */ - MPI_Offset h_minfree; /* pad at the end of the header section */ - MPI_Offset v_minfree; /* pad at the end of the data section for fixed-size variables */ - MPI_Offset ibuf_size; /* packing buffer size for flushing noncontig - user buffer during wait */ - MPI_Offset xsz; /* size of this file header, <= var[0].begin */ - MPI_Offset begin_var; /* file offset of the first fixed-size variable, - if no fixed-sized variable, it is the offset - of first record variable. This value is also - the size of file header extent. 
*/ - MPI_Offset begin_rec; /* file offset of the first 'record' */ - + int hdr_chunk; /* chunk size for reading header, one chunk at a time */ + int data_chunk; /* chunk size for moving variables to higher offsets */ + int nc_striping; /* PNCIO_STRIPING_AUTO or PNCIO_STRIPING_INHERIT */ + MPI_Offset v_align; /* alignment of the beginning of fixed-size variables */ + MPI_Offset r_align; /* file alignment for record variable section */ + MPI_Offset info_v_align; /* v_align set in MPI Info object */ + MPI_Offset info_r_align; /* r_align set in MPI Info object */ + MPI_Offset h_minfree; /* pad at the end of the header section */ + MPI_Offset v_minfree; /* pad at the end of the data section for fixed-size variables */ + MPI_Offset ibuf_size; /* packing buffer size for flushing noncontig + user buffer during wait */ + MPI_Offset xsz; /* size of this file header, <= var[0].begin */ + MPI_Offset begin_var; /* file offset of the first fixed-size variable, + if no fixed-sized variable, it is the offset + of first record variable. This value is also + the size of file header extent. */ + MPI_Offset begin_rec; /* file offset of the first 'record' */ + + MPI_Offset fix_end; /* end offset of last fix-sized variable */ MPI_Offset recsize; /* length of 'record': sum of single record sizes of all the record variables */ - MPI_Offset numrecs; /* number of 'records' allocated */ + MPI_Offset numrecs; /* no. 'records' allocated */ MPI_Offset put_size; /* amount of writes committed so far in bytes */ MPI_Offset get_size; /* amount of reads committed so far in bytes */ - MPI_Comm comm; /* MPI communicator */ - int rank; /* MPI rank of this process */ - int nprocs; /* number of MPI processes */ - MPI_Info mpiinfo; /* used MPI info object */ - MPI_File collective_fh; /* file handle for collective mode */ - MPI_File independent_fh; /* file handle for independent mode */ + MPI_Comm comm; /* MPI communicator */ + int rank; /* MPI rank of this process */ + int nprocs; /* no. 
MPI processes */ + PNCIO_node_ids node_ids; /* node IDs of each rank */ + MPI_Info mpiinfo; /* used MPI info object */ + MPI_File collective_fh; /* MPI-IO file handle for collective mode */ + MPI_File independent_fh; /* MPI-IO file handle for independent mode */ + PNCIO_File *pncio_fh; /* PNCIO file handler */ + int fstype; /* file system type: PNCIO_LUSTRE, PNCIO_UFS */ NC_dimarray dims; /* dimensions defined */ NC_attrarray attrs; /* global attributes defined */ @@ -426,36 +451,55 @@ struct NC { int maxGetReqID; /* max get request ID */ int maxPutReqID; /* max put request ID */ - int numLeadGetReqs; /* number of pending lead get requests */ - int numLeadPutReqs; /* number of pending lead put requests */ + int numLeadGetReqs; /* no. pending lead get requests */ + int numLeadPutReqs; /* no. pending lead put requests */ NC_lead_req *get_lead_list; /* list of lead nonblocking read requests */ NC_lead_req *put_lead_list; /* list of lead nonblocking write requests */ - int numGetReqs; /* number of pending nonblocking get requests */ - int numPutReqs; /* number of pending nonblocking put requests */ + int numGetReqs; /* no. pending nonblocking get requests */ + int numPutReqs; /* no. pending nonblocking put requests */ NC_req *get_list; /* list of nonblocking read requests */ NC_req *put_list; /* list of nonblocking write requests */ NC_buf *abuf; /* attached buffer, used by bput APIs */ - char *path; /* file name */ + const char *path; /* file name */ struct NC *old; /* contains the previous NC during redef. */ - /* Below are used for intra-node aggregation */ - int num_aggrs_per_node; /* number of aggregators per compute node. Set - through a user hint. 0 to disable the - intra-node aggregation, -1 to let PnetCDF to - decide. This value must be the same among all - processes. + /* Below are used for intra-node aggregation (INA) */ + MPI_Comm ina_comm; /* communicator of only intra-node aggregators */ + int ina_nprocs;/* no. 
processes in intra-node communicator */ + int ina_rank; /* rank ID in intra-node communicator */ + int num_aggrs_per_node; /* no. aggregators per compute node. Set through a + * user hint. 0 to disable the intra-node + * aggregation, -1 to let PnetCDF to decide.This + * value must be the same among all processes. */ int my_aggr; /* rank ID of my aggregator */ - int num_nonaggrs; /* number of non-aggregators assigned */ + int num_nonaggrs; /* no. non-aggregators assigned */ int *nonaggr_ranks; /* ranks of assigned non-aggregators */ + int *ina_node_list; /* rank IDs of INA aggregators */ + #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double aggr_time; + double ina_time_init; + double ina_time_flatten; + double ina_time_put[5]; + double ina_time_get[5]; + size_t ina_npairs_put; + size_t ina_npairs_get; + size_t maxmem_put[6]; + size_t maxmem_get[6]; #endif }; +typedef struct bufferinfo { + NC *ncp; + MPI_Offset offset; /* current read/write offset in the file */ + char *base; /* beginning of read/write buffer */ + char *pos; /* current position in buffer */ + char *end; /* end position of buffer */ +} bufferinfo; + #define NC_readonly(ncp) fIsSet((ncp)->flags, NC_MODE_RDONLY) #define NC_IsNew(ncp) fIsSet((ncp)->flags, NC_MODE_CREATE) #define NC_indef(ncp) fIsSet((ncp)->flags, NC_MODE_DEF) @@ -474,9 +518,6 @@ struct NC { (NC_EMULTIDEFINE_FIRST >= (err) && (err) >= NC_EMULTIDEFINE_LAST) /* Begin defined in nc.c ----------------------------------------------------*/ -extern void -ncmpio_free_NC(NC *ncp); - extern int ncmpio_NC_check_vlen(NC_var *varp, MPI_Offset vlen_max); @@ -487,20 +528,6 @@ extern int ncmpio_NC_check_voffs(NC *ncp); /* Begin defined in ncmpio_header_get.c -------------------------------------*/ -typedef struct bufferinfo { - MPI_Comm comm; - MPI_File collective_fh; - MPI_Offset get_size; /* amount of file read n bytes so far */ - MPI_Offset offset; /* current read/write offset in the file */ - int chunk; /* chunk size for reading the 
header */ - int version; /* 1, 2, and 5 for CDF-1, 2, and 5 respectively */ - int safe_mode;/* 0: disabled, 1: enabled */ - int coll_mode;/* 0: independent, 1: collective */ - char *base; /* beginning of read/write buffer */ - char *pos; /* current position in buffer */ - char *end; /* end position of buffer */ -} bufferinfo; - extern MPI_Offset ncmpio_hdr_len_NC(const NC *ncp); @@ -515,9 +542,6 @@ extern int ncmpio_write_header(NC *ncp); /* Begin defined in ncmpio_sync.c -------------------------------------------*/ -extern int -ncmpio_file_sync(NC *ncp); - extern int ncmpio_write_numrecs(NC *ncp, MPI_Offset new_numrecs); @@ -528,10 +552,6 @@ ncmpio_filetype_create_vars(const NC* ncp, const NC_var* varp, const MPI_Offset stride[], MPI_Offset *offset, MPI_Datatype *filetype, int *is_filetype_contig); -extern int -ncmpio_file_set_view(const NC *ncp, MPI_File fh, MPI_Offset *offset, - MPI_Datatype filetype); - /* Begin defined in ncmpio_igetput.m4 ---------------------------------------*/ extern int ncmpio_abuf_malloc(NC *ncp, MPI_Offset nbytes, void **buf, int *abuf_index); @@ -607,17 +627,16 @@ ncmpio_inq_var_fill(NC_var *varp, void *fill_value); extern int ncmpio_fill_vars(NC *ncp); -/* Begin defined in ncmpio_nonblocking.c ------------------------------------*/ -extern int -ncmpio_getput_zero_req(NC *ncp, int rw_flag); - -/* Begin defined in ncmpio_close.c */ -extern int -ncmpio_close_files(NC *ncp, int doUnlink); +/* Begin defined in ncmpio_close.c ------------------------------------------*/ +extern void +ncmpio_free_NC(NC *ncp); /* Begin defined in ncmpio_utils.c ------------------------------------------*/ extern void -ncmpio_set_pnetcdf_hints(NC *ncp, MPI_Info user_info, MPI_Info info_used); +ncmpio_hint_extract(NC *ncp, MPI_Info info); + +extern void +ncmpio_hint_set(NC *ncp, MPI_Info info); extern int ncmpio_NC_check_name(const char *name, int file_ver); @@ -644,23 +663,73 @@ ncmpio_unpack_xbuf(int format, NC_var *varp, MPI_Offset bufcount, MPI_Datatype 
etype, MPI_Datatype imaptype, int need_convert, int need_swap, void *buf, void *xbuf); +extern int +ncmpio_calc_off(const NC *ncp, const NC_var *varp, const MPI_Offset *start, + MPI_Offset *offset); + +extern int +ncmpio_calc_start_end(const NC *ncp, const NC_var *varp, + const MPI_Offset *start, const MPI_Offset *count, + const MPI_Offset *stride, MPI_Offset *start_off, + MPI_Offset *end_off); + /* Begin defined in ncmpio_file_io.c ----------------------------------------*/ +extern MPI_Offset +ncmpio_file_read_at(NC *ncp, MPI_Offset offset, void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_read_at_all(NC *ncp, MPI_Offset offset, void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_write_at(NC *ncp, MPI_Offset offset, const void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_write_at_all(NC *ncp, MPI_Offset offset, const void *buf, + PNCIO_View buf_view); + +extern int +ncmpio_getput_zero_req(NC *ncp, int rw_flag); + +extern int +ncmpio_read_write(NC *ncp, int rw_flag, MPI_Offset offset, + PNCIO_View flat_btype, void *buf); + +extern int +ncmpio_file_close(NC *ncp); + +extern int +ncmpio_file_delete(NC *ncp); + +extern int +ncmpio_file_sync(NC *ncp); + +extern int +ncmpio_file_set_view(const NC *ncp, MPI_Offset disp, MPI_Datatype filetype, + MPI_Aint npairs, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *offsets, MPI_Count *lengths +#else + MPI_Offset *offsets, int *lengths +#endif +); + extern int -ncmpio_read_write(NC *ncp, int rw_flag, int coll_indep, MPI_Offset offset, - MPI_Offset buf_count, MPI_Datatype buf_type, void *buf, - int buftype_is_contig); +ncmpio_file_open(NC *ncp, MPI_Comm comm, const char *path, int omode, + MPI_Info info); /* Begin defined in ncmpio_intranode.c --------------------------------------*/ extern int -ncmpio_intra_node_aggr_init(NC *ncp); +ncmpio_ina_init(NC *ncp); extern int -ncmpio_intra_node_aggregation_nreqs(NC *ncp, int mode, int num_reqs, - NC_req *put_list, MPI_Offset newnumrecs); 
+ncmpio_ina_nreqs(NC *ncp, int mode, int num_reqs, NC_req *put_list, + MPI_Offset newnumrecs); extern int -ncmpio_intra_node_aggregation(NC *ncp, int mode, NC_var *varp, - const MPI_Offset *start, const MPI_Offset *count, - const MPI_Offset *stride, MPI_Offset bufCount, - MPI_Datatype bufType, void *buf); +ncmpio_ina_req(NC *ncp, int mode, NC_var *varp, const MPI_Offset *start, + const MPI_Offset *count, const MPI_Offset *stride, + MPI_Offset nbytes, void *buf); #endif /* H_NC */ diff --git a/src/drivers/ncmpio/ncmpio_attr.m4 b/src/drivers/ncmpio/ncmpio_attr.m4 index 5b969ded70..46c0b3733f 100644 --- a/src/drivers/ncmpio/ncmpio_attr.m4 +++ b/src/drivers/ncmpio/ncmpio_attr.m4 @@ -477,7 +477,7 @@ ncmpio_rename_att(void *ncdp, err_check: if (nname != NULL) NCI_Free(nname); - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { int minE, mpireturn; /* check error code across processes */ @@ -597,7 +597,7 @@ ncmpio_copy_att(void *ncdp_in, } err_check: - if (ncp_out->safe_mode && ncp_out->nprocs > 1) { + if (fIsSet(ncp_out->flags, NC_MODE_SAFE) && ncp_out->nprocs > 1) { int minE, mpireturn; /* check the error code across processes */ @@ -710,7 +710,7 @@ ncmpio_del_att(void *ncdp, err_check: if (nname != NULL) NCI_Free(nname); - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { int minE, mpireturn; /* find min error code across processes */ @@ -1044,7 +1044,7 @@ ncmpio_put_att(void *ncdp, } err_check: - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { /* check the error code across processes */ int minE, mpireturn; diff --git a/src/drivers/ncmpio/ncmpio_close.c b/src/drivers/ncmpio/ncmpio_close.c index ec79088e81..e93a74a442 100644 --- a/src/drivers/ncmpio/ncmpio_close.c +++ b/src/drivers/ncmpio/ncmpio_close.c @@ -51,46 +51,11 @@ ncmpio_free_NC(NC *ncp) if (ncp->get_list != NULL) NCI_Free(ncp->get_list); if 
(ncp->put_list != NULL) NCI_Free(ncp->put_list); if (ncp->abuf != NULL) NCI_Free(ncp->abuf); - if (ncp->path != NULL) NCI_Free(ncp->path); if (ncp->nonaggr_ranks != NULL) NCI_Free(ncp->nonaggr_ranks); NCI_Free(ncp); } -/*----< ncmpio_close_files() >-----------------------------------------------*/ -int -ncmpio_close_files(NC *ncp, int doUnlink) { - char *mpi_name; - int mpireturn; - - assert(ncp != NULL); /* this should never occur */ - - if (ncp->independent_fh != MPI_FILE_NULL) { - TRACE_IO(MPI_File_close, (&ncp->independent_fh)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - - if (ncp->nprocs > 1 && ncp->collective_fh != MPI_FILE_NULL) { - TRACE_IO(MPI_File_close, (&ncp->collective_fh)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - - if (doUnlink) { - /* called from ncmpi_abort, if the file is being created and is still - * in define mode, the file is deleted */ - if (ncp->rank == 0) { - TRACE_IO(MPI_File_delete, ((char *)ncp->path, ncp->mpiinfo)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - if (ncp->nprocs > 1) - MPI_Barrier(ncp->comm); - } - return NC_NOERR; -} - /*----< ncmpio_close() >------------------------------------------------------*/ /* This function is collective */ int @@ -159,8 +124,69 @@ ncmpio_close(void *ncdp) } #endif - /* calling MPI_File_close() */ - err = ncmpio_close_files(ncp, 0); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int i, j, ntimers; + double tt[16], max_t[16], put_time=0, get_time=0; + MPI_Offset sizes[16], max_sizes[16], max_npairs_put=0, max_npairs_get=0; + + /* print intra-node aggregation timing breakdown */ + if (ncp->num_aggrs_per_node > 0) { + j = 0; + for (i=0; i<6; i++) sizes[j++] = ncp->maxmem_put[i]; + for (i=0; i<6; i++) sizes[j++] = ncp->maxmem_get[i]; + sizes[12] = ncp->ina_npairs_put; + sizes[13] = ncp->ina_npairs_get; + + MPI_Allreduce(sizes, max_sizes, 14, 
MPI_OFFSET, MPI_MAX, ncp->comm); + max_npairs_put = max_sizes[12]; + max_npairs_get = max_sizes[13]; + + for (i=0; i<12; i++) tt[i] = (float)(max_sizes[i]) / 1048576.0; /* in MiB */ + if (ncp->rank == 0 && max_npairs_put > 0) + printf("%s: INA put npairs=%lld mem=%.1f %.1f %.1f %.1f %.1f %.1f (MiB)\n", + __func__, max_sizes[12], tt[0],tt[1],tt[2],tt[3],tt[4],tt[5]); + if (ncp->rank == 0 && max_npairs_get > 0) + printf("%s: INA get npairs=%lld mem=%.1f %.1f %.1f %.1f %.1f %.1f (MiB)\n", + __func__, max_sizes[13], tt[6],tt[7],tt[8],tt[9],tt[10],tt[11]); + + if (max_npairs_put > 0) { /* put npairs > 0 */ + put_time = ncp->ina_time_init + ncp->ina_time_flatten; + ntimers = 5; + for (i=0; iina_time_put[i]; + put_time += tt[i]; + } + tt[ntimers] = ncp->ina_time_init; + tt[ntimers+1] = ncp->ina_time_flatten; + tt[ntimers+2] = put_time; + + MPI_Reduce(tt, max_t, ntimers+3, MPI_DOUBLE, MPI_MAX, 0, ncp->comm); + put_time = max_t[ntimers+2]; + if (ncp->rank == 0) + printf("%s: INA put timing %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f = %5.2f\n", + __func__, max_t[ntimers],max_t[ntimers+1],max_t[0],max_t[1],max_t[2],max_t[3],max_t[4],put_time); + } + if (max_npairs_get > 0) { /* get npairs > 0 */ + get_time = ncp->ina_time_init + ncp->ina_time_flatten; + ntimers = 5; + for (i=0; iina_time_get[i]; + get_time += tt[i]; + } + tt[ntimers] = ncp->ina_time_init; + tt[ntimers+1] = ncp->ina_time_flatten; + tt[ntimers+2] = get_time; + + MPI_Reduce(tt, max_t, ntimers+3, MPI_DOUBLE, MPI_MAX, 0, ncp->comm); + if (ncp->rank == 0) + printf("%s: INA get timing %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f = %5.2f\n", + __func__, max_t[ntimers],max_t[ntimers+1],max_t[0],max_t[1],max_t[2],max_t[3],max_t[4],max_t[ntimers+2]); + } + } +#endif + + /* close the file */ + err = ncmpio_file_close(ncp); if (status == NC_NOERR) status = err; /* file is open for write and no variable has been defined */ @@ -219,6 +245,14 @@ ncmpio_close(void *ncdp) if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); } + /* free 
the intra-node aggregation communicator */ + if (ncp->ina_comm != MPI_COMM_NULL) + MPI_Comm_free(&ncp->ina_comm); + + /* collectively return the same error code */ + if (ncp->nprocs > 1) + MPI_Allreduce(MPI_IN_PLACE, &status, 1, MPI_INT, MPI_MIN, ncp->comm); + /* free up space occupied by the header metadata */ ncmpio_free_NC(ncp); diff --git a/src/drivers/ncmpio/ncmpio_create.c b/src/drivers/ncmpio/ncmpio_create.c index e5cee83d3f..9d318ffa3b 100644 --- a/src/drivers/ncmpio/ncmpio_create.c +++ b/src/drivers/ncmpio/ncmpio_create.c @@ -8,7 +8,6 @@ * This file implements the corresponding APIs defined in src/dispatchers/file.c * * ncmpi_create() : dispatcher->create() - * ncmpi_open() : dispatcher->open() */ #ifdef HAVE_CONFIG_H @@ -35,25 +34,30 @@ /*----< ncmpio_create() >----------------------------------------------------*/ int -ncmpio_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - MPI_Info user_info, /* user's and env info combined */ - void **ncpp) +ncmpio_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info user_info, /* user's and env info combined */ + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { - char *env_str, *filename, *mpi_name; + char *filename, value[MPI_MAX_INFO_VAL + 1], *mpi_name; int rank, nprocs, mpiomode, err, mpireturn, default_format, file_exist=1; - int use_trunc=1; - MPI_File fh; - MPI_Info info_used; + int use_trunc=1, flag, striping_unit; + MPI_File fh=MPI_FILE_NULL; NC *ncp=NULL; *ncpp = NULL; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + /* Note path's validity and cmode consistency have been checked in - * ncmpi_create() in src/dispatchers/file.c and - * path consistency will be done in MPI_File_open */ + * ncmpi_create() in src/dispatchers/file.c and path consistency will be + * done in MPI_File_open. 
+ */ /* First, check whether cmode is valid or supported ---------------------*/ @@ -66,25 +70,62 @@ ncmpio_create(MPI_Comm comm, /* Check cmode for other illegal flags already done in dispatcher layer */ /* Get default format, in case cmode does not include either - * NC_64BIT_OFFSET or NC_64BIT_DATA */ + * NC_64BIT_OFFSET or NC_64BIT_DATA. + */ ncmpi_inq_default_format(&default_format); - /* Handle file clobber --------------------------------------------------*/ - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &nprocs); + /* allocate buffer for header object NC and initialize its contents */ + ncp = (NC*) NCI_Calloc(1, sizeof(NC)); + if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) + + *ncpp = (void*)ncp; + + ncp->ncid = ncid; + ncp->comm = comm; /* reuse comm duplicated in dispatch layer */ + ncp->rank = rank; + ncp->nprocs = nprocs; + + /* Extract hints from user_info. Two hints must be extracted now in order + * to continue: + * nc_pncio: whether to use MPI-IO or PnetCDF's PNCIO driver. + * nc_num_aggrs_per_node: number of processes per node to be the INA + * aggregators. + * + * ncp->fstype will be initialized in ncmpio_hint_extract(), and set in + * PNCIO_FileSysType(). + */ + ncmpio_hint_extract(ncp, user_info); + + if (ncp->fstype == PNCIO_FSTYPE_CHECK && rank == 0) + /* Check file system type. If the given file does not exist, check its + * folder. Currently PnetCDF's PNCIO drivers support Lustre + * (PNCIO_LUSTRE) and Unix File System (PNCIO_UFS). + */ + ncp->fstype = PNCIO_FileSysType(path); + +#ifdef WKL_DEBUG +if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == PNCIO_FSTYPE_MPIIO)? "PNCIO_FSTYPE_MPIIO" : (ncp->fstype == PNCIO_LUSTRE) ? "PNCIO_LUSTRE" : "PNCIO_UFS"); +#endif + /* Setting file open mode in mpiomode which may later be needed in + * ncmpi_begin_indep_data() to open file for independent data mode. + */ mpiomode = MPI_MODE_RDWR | MPI_MODE_CREATE; - /* remove the file system type prefix name if there is any.
For example, + /* Remove the file system type prefix name if there is any. For example, * when path = "lustre:/home/foo/testfile.nc", remove "lustre:" to make * filename pointing to "/home/foo/testfile.nc", so it can be used in POSIX - * access() below + * access() below. */ filename = ncmpii_remove_file_system_type_prefix(path); - /* Check if the file already exists, if lstat() or access() is available */ + /* In case of clobber mode, first check if the file already exists, through + * a call to lstat() or access() if they are available. If not, we + * assume the file exists and will add some MPI flag to open mode argument + * of MPI_File_open to either delete or truncate the file first. + */ #ifdef HAVE_LSTAT - /* call lstat() to check the file if exists and if is a symbolic link */ + /* Call lstat() to check whether the file exists and whether it is a symbolic link */ if (rank == 0) { struct stat st_buf; st_buf.st_mode = 0; @@ -92,21 +133,23 @@ ncmpio_create(MPI_Comm comm, if (lstat(filename, &st_buf) == -1) file_exist = 0; errno = 0; /* reset errno */ - /* If the file is a regular file, not a symbolic link, then we can - * delete the file first and later create it when calling - * MPI_File_open() with MPI_MODE_CREATE. It is OK to delete and then - * re-create the file if the file is a regular file. If there are other - * files symbolically linked to this file, then their links will still - * point to this file after it is re-created. + /* If the file is a regular file, not a symbolic link, then we delete + * the file first and later create it when calling MPI_File_open() with + * MPI_MODE_CREATE. If the file is a regular file, not a symbolic link, + * it is faster to delete it and then re-create the file, as truncating + * it to zero size is more expensive. * * If the file is a symbolic link, then we cannot delete the file, as - * the link will be gone.
If the file is deleted and there are other + * files symbolically linked to this file, then their links will become + * invalid. */ if (S_ISREG(st_buf.st_mode)) use_trunc = 0; } #elif defined HAVE_ACCESS - /* if access() is available, use it to check whether file already exists - * rank 0 calls access() and broadcasts file_exist */ + /* If access() is available, use it to check whether file already exists, + * by having rank 0 to call access() and broadcast file_exist. + */ if (rank == 0) { if (access(filename, F_OK) == -1) file_exist = 0; errno = 0; /* reset errno */ @@ -114,53 +157,80 @@ ncmpio_create(MPI_Comm comm, #endif if (fIsSet(cmode, NC_NOCLOBBER)) { - /* check if file exists: NC_EEXIST is returned if the file already - * exists and NC_NOCLOBBER mode is used in ncmpi_create */ + /* Error NC_EEXIST will be returned, if the file already exists and + * NC_NOCLOBBER mode is set in ncmpi_create. + */ #ifdef HAVE_ACCESS - if (nprocs > 1) - TRACE_COMM(MPI_Bcast)(&file_exist, 1, MPI_INT, 0, comm); - if (file_exist) DEBUG_RETURN_ERROR(NC_EEXIST) + if (nprocs > 1) { + int msg[2] = {file_exist, ncp->fstype}; + TRACE_COMM(MPI_Bcast)(msg, 2, MPI_INT, 0, comm); + file_exist = msg[0]; + ncp->fstype = msg[1]; + } + if (file_exist) { + NCI_Free(ncp); + DEBUG_RETURN_ERROR(NC_EEXIST) + } #else - /* add MPI_MODE_EXCL mode for MPI_File_open to check file existence */ + if (nprocs > 1) + TRACE_COMM(MPI_Bcast)(&ncp->fstype, 1, MPI_INT, 0, comm); + + /* Add MPI_MODE_EXCL mode for MPI_File_open, so it can error out, if + * the file exists. + */ fSet(mpiomode, MPI_MODE_EXCL); errno = 0; /* reset errno, as MPI_File_open may change it */ #endif } - else { /* NC_CLOBBER is the default mode in create */ - /* rank 0 truncates or deletes the file and ignores error code. - * Note calling MPI_File_set_size is expensive as it calls truncate() + else { + /* NC_CLOBBER is the default mode in ncmpi_create(). Below, rank 0 + * truncates or deletes the file and ignores error code. 
Note in some + * implementations of MPI-IO, calling MPI_File_set_size is expensive as + * it may call truncate() by all ranks. */ err = NC_NOERR; if (rank == 0 && file_exist) { if (!use_trunc) { /* delete the file */ #ifdef HAVE_UNLINK - /* unlink() is likely faster then truncate(), but may be still - * expensive + /* unlink() is likely faster than truncate(). However, unlink() + * can be expensive when the file size is large. For example, + * it took 1.1061 seconds to delete a file of size 27.72 GiB + * on Perlmutter at NERSC. */ err = unlink(filename); if (err < 0 && errno != ENOENT) /* ignore ENOENT: file not exist */ - DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* other error */ + DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* report other error */ else err = NC_NOERR; #else err = NC_NOERR; - TRACE_IO(MPI_File_delete, ((char *)path, MPI_INFO_NULL)); - if (mpireturn != MPI_SUCCESS) { - int errorclass; - MPI_Error_class(mpireturn, &errorclass); - if (errorclass != MPI_ERR_NO_SUCH_FILE) - /* ignore file not exist */ - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) + err = PNCIO_File_delete(filename); + else { + TRACE_IO(MPI_File_delete, (path, MPI_INFO_NULL)); + if (mpireturn != MPI_SUCCESS) { + int errorclass; + MPI_Error_class(mpireturn, &errorclass); + if (errorclass != MPI_ERR_NO_SUCH_FILE) + /* ignore file not exist */ + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } } #endif } - else { /* file is not a regular file, truncate it to zero size */ + else { + /* If file is not a regular file (e.g. a symbolic link), we + * cannot delete it and must truncate it to zero size. In this + * case, file open mode needs to remove MPI_MODE_CREATE.
+ */ + mpiomode = MPI_MODE_RDWR; + #ifdef HAVE_TRUNCATE - err = truncate(filename, 0); /* can be expensive */ + err = truncate(filename, 0); /* This may be expensive */ if (err < 0 && errno != ENOENT) /* ignore ENOENT: file not exist */ - DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* other error */ + DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* report other error */ else err = NC_NOERR; #elif defined HAVE_OPEN @@ -173,82 +243,79 @@ ncmpio_create(MPI_Comm comm, DEBUG_ASSIGN_ERROR(err, NC_EFILE) } #else - /* call MPI_File_set_size() to truncate the file. Note this can - * be expensive. + /* When all POSIX system calls are not available, the last + * resort is to call MPI_File_set_size() to truncate the file. + * Note for some ROMIO versions that have all processes call + * truncate(), this option can be expensive. */ err = NC_NOERR; - TRACE_IO(MPI_File_open, (MPI_COMM_SELF, (char *)path, MPI_MODE_RDWR, MPI_INFO_NULL, &fh)); - if (mpireturn != MPI_SUCCESS) { - int errorclass; - MPI_Error_class(mpireturn, &errorclass); - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + PNCIO_File pncio_fh; + pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File)); + err = PNCIO_File_open(MPI_COMM_SELF, filename, + MPI_MODE_RDWR, MPI_INFO_NULL, + pncio_fh); + if (err == NC_NOERR) + PNCIO_File_set_size(pncio_fh, 0); /* can be expensive */ + else + PNCIO_File_close(&pncio_fh); + NCI_Free(pncio_fh); } else { - TRACE_IO(MPI_File_set_size, (fh, 0)); /* can be expensive */ + TRACE_IO(MPI_File_open, (MPI_COMM_SELF, path, MPI_MODE_RDWR, MPI_INFO_NULL, &fh)); if (mpireturn != MPI_SUCCESS) { int errorclass; MPI_Error_class(mpireturn, &errorclass); err = ncmpii_error_mpi2nc(mpireturn, mpi_name); } else { - TRACE_IO(MPI_File_close, (&fh)); + TRACE_IO(MPI_File_set_size, (fh, 0)); /* can be expensive */ if (mpireturn != MPI_SUCCESS) { int errorclass; MPI_Error_class(mpireturn, &errorclass); err = ncmpii_error_mpi2nc(mpireturn, mpi_name); } + else { + 
TRACE_IO(MPI_File_close, (&fh)); + if (mpireturn != MPI_SUCCESS) { + int errorclass; + MPI_Error_class(mpireturn, &errorclass); + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + } } } #endif } if (errno == ENOENT) errno = 0; /* reset errno */ } - /* all processes must wait here until file deletion is completed */ - if (nprocs > 1) - TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, comm); - if (err != NC_NOERR) return err; - } - - /* create file collectively -------------------------------------------- */ - TRACE_IO(MPI_File_open, (comm, (char *)path, mpiomode, user_info, &fh)); - if (mpireturn != MPI_SUCCESS) { -#ifndef HAVE_ACCESS - if (fIsSet(cmode, NC_NOCLOBBER)) { - /* This is the case when NC_NOCLOBBER is used in file creation and - * function access() is not available. MPI_MODE_EXCL is set in open - * mode. When MPI_MODE_EXCL is used and the file already exists, - * MPI-IO should return error class MPI_ERR_FILE_EXISTS. But, some - * MPI-IO implementations (older ROMIO) do not correctly return - * this error class. In this case, we can do the followings: check - * errno to see if it set to EEXIST. Note usually rank 0 makes the - * file open call and can be the only one having errno set. - */ - if (nprocs > 1) - TRACE_COMM(MPI_Bcast)(&errno, 1, MPI_INT, 0, comm); - if (errno == EEXIST) DEBUG_RETURN_ERROR(NC_EEXIST) - } -#endif - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - /* for NC_NOCLOBBER, MPI_MODE_EXCL was added to mpiomode. If the file - * already exists, MPI-IO should return error class MPI_ERR_FILE_EXISTS - * which PnetCDF will return error code NC_EEXIST. This is checked - * inside of ncmpii_error_mpi2nc() + /* All processes must wait here until clobbering file by root process + * is completed. Note mpiomode may be changed to remove MPI_MODE_CREATE + * when the file to be clobbered is a symbolic link. 
*/ + if (nprocs > 1) { + int msg[3] = {err, mpiomode, ncp->fstype}; + TRACE_COMM(MPI_Bcast)(&msg, 3, MPI_INT, 0, comm); + err = msg[0]; + mpiomode = msg[1]; + ncp->fstype = msg[2]; + } + if (err != NC_NOERR) return err; } - else - /* reset errno, as MPI_File_open may change it, even for MPI_SUCCESS */ - errno = 0; + /* Now file has been clobbered, i.e. deleted if it is not a symbolic link. + * If it is a symbolic link, it now has been truncated to zero size. + */ - /* get the I/O hints used/modified by MPI-IO */ - TRACE_IO(MPI_File_get_info, (fh, &info_used)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); + ncp->path = path; /* reuse path duplicated in dispatch layer */ + ncp->pncio_fh = NULL; /* non-aggregators have NULL pncio_fh */ + ncp->mpiomode = mpiomode; + ncp->mpiinfo = MPI_INFO_NULL; - /* Now the file has been successfully created, allocate/set NC object */ + /* For file create, ignore NC_NOWRITE if set in cmode argument. */ + ncp->iomode = cmode | NC_WRITE; - /* allocate buffer for header object NC and initialize its contents */ - ncp = (NC*) NCI_Calloc(1, sizeof(NC)); - if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) + ncp->collective_fh = MPI_FILE_NULL; + ncp->independent_fh = MPI_FILE_NULL; /* set the file format version based on the create mode, cmode */ if (fIsSet(cmode, NC_64BIT_DATA)) ncp->format = 5; @@ -259,6 +326,7 @@ ncmpio_create(MPI_Comm comm, else ncp->format = 1; } + /* indicate this is from ncmpi_create */ fSet(ncp->flags, NC_MODE_CREATE); /* create automatically enter write mode */ fClr(ncp->flags, NC_MODE_RDONLY); @@ -267,70 +335,256 @@ ncmpio_create(MPI_Comm comm, /* PnetCDF default mode is no fill */ fClr(ncp->flags, NC_MODE_FILL); - ncp->ncid = ncid; - - /* chunk size for reading header, set to default before check hints */ - ncp->chunk = PNC_DEFAULT_CHUNKSIZE; - - /* calculate the true header size (not-yet aligned) - * No need to do this now. 
- * ncp->xsz = ncmpio_hdr_len_NC(ncp); - */ + /* incorporate modes set in environment variables */ + fSet(ncp->flags, env_mode); /* initialize unlimited_id as no unlimited dimension yet defined */ ncp->dims.unlimited_id = -1; - /* buffer to pack noncontiguous user buffers when calling wait() */ - ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE; - - /* Extract PnetCDF specific I/O hints from user_info and set default hint - * values into info_used. Note some MPI libraries, such as MPICH 3.3.1 and - * priors fail to preserve user hints that are not recognized by the MPI - * libraries. + /* node_ids stores a list of unique IDs of compute nodes of all MPI ranks + * in the MPI communicator passed from the user application. It is a keyval + * attribute cached in the communicator. See src/dispatchers/file.c for + * details. The node IDs will be used when the intra-node aggregation (INA) + * is enabled and when PnetCDF's PNCIO driver is used. + * + * When intra-node aggregation (INA) is enabled, node IDs are used to + * create a new MPI communicator consisting of the intra-node aggregators + * only. The communicator will be used to call file open in MPI-IO or + * PnetCDF's PNCIO driver. This means only intra-node aggregators will + * perform file I/O in PnetCDF collective put and get operations. + * + * node_ids will be used to calculate cb_nodes, the number of MPI-IO/PNCIO + * aggregators (not INA aggregators). */ - ncmpio_set_pnetcdf_hints(ncp, user_info, info_used); - - /* For file create, ignore if NC_NOWRITE set in cmode by user */ - ncp->iomode = cmode | NC_WRITE; - ncp->comm = comm; /* reuse comm duplicated in dispatch layer */ - ncp->mpiinfo = info_used; /* is not MPI_INFO_NULL */ - ncp->mpiomode = mpiomode; - ncp->rank = rank; - ncp->nprocs = nprocs; - ncp->collective_fh = fh; - ncp->independent_fh = (nprocs > 1) ? 
MPI_FILE_NULL : fh; - ncp->path = (char*) NCI_Malloc(strlen(path) + 1); - strcpy(ncp->path, path); - -#ifdef PNETCDF_DEBUG - /* PNETCDF_DEBUG is set at configure time, which will be overwritten by - * the run-time environment variable PNETCDF_SAFE_MODE */ - ncp->safe_mode = 1; -#endif - /* If environment variable PNETCDF_SAFE_MODE is set to 1, then we perform - * a strict consistent test, i.e. arguments used in def_dim/def_var APIs + ncp->node_ids = node_ids; + + /* When the total number of aggregators >= number of processes, disable + * intra-node aggregation. */ - if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) { - if (*env_str == '0') ncp->safe_mode = 0; - else ncp->safe_mode = 1; - /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can - * be '\0' (null character). In this case, safe_mode is enabled */ - } + if (ncp->num_aggrs_per_node * node_ids.num_nodes >= ncp->nprocs) + ncp->num_aggrs_per_node = 0; - /* determine whether to enable intra-node aggregation and set up all - * intra-node aggregation metadata. - * ncp->num_aggrs_per_node = 0, or non-zero indicates whether this feature - * is enabled globally for all processes. - * ncp->my_aggr = -1 or >= 0 indicates whether aggregation is effectively - * enabled for the aggregation group of this process. + /* ncp->num_aggrs_per_node = 0, or > 0 is an indicator of whether the INA + * feature is disabled or enabled globally for all processes. */ ncp->my_aggr = -1; - if (ncp->num_aggrs_per_node != 0) { - err = ncmpio_intra_node_aggr_init(ncp); - if (err != NC_NOERR) return err; + ncp->ina_comm = MPI_COMM_NULL; + ncp->ina_nprocs = 0; + ncp->ina_rank = -1; + ncp->ina_node_list = NULL; + if (ncp->num_aggrs_per_node > 0) { + /* Must duplicate node_ids, as node_ids.ids[] will be modified by + * ncmpio_ina_init(). + */ + ncp->node_ids.ids = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + memcpy(ncp->node_ids.ids, node_ids.ids, sizeof(int) * ncp->nprocs); + + /* Divide all ranks into groups. 
Each group is assigned one intra-node + * aggregator. The following metadata related to intra-node aggregation + * will be set up in ncmpio_ina_init(). + * ncp->my_aggr is the aggregator's rank ID (related to ncp->comm) of + * this group. When == ncp->rank, this rank is an aggregator. + * ncp->num_nonaggrs is the number of non-aggregators assigned to this + * rank (an aggregator) + * ncp->ina_comm is an MPI communicator consisting of only intra-node + * aggregators across all nodes, which will be used when calling + * MPI_File_open(). For non-aggregator, it == MPI_COMM_NULL. + * ncp->node_ids.ids[] will be modified to contain the nodes IDs of all + * intra-node aggregators, and will be passed to pncio_fh. + */ + err = ncmpio_ina_init(ncp); + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); + + /* As non-aggregators will not perform any file I/O, we now can replace + * comm with ina_comm. Same for nprocs. + */ + comm = ncp->ina_comm; + nprocs = ncp->ina_nprocs; + + /* For non-aggregators, comm is MPI_COMM_NULL. As the remaining task of + * this subroutine is to open the file and obtain the file handler, + * non-aggregators can skip. + */ + if (comm == MPI_COMM_NULL) { + if (user_info != MPI_INFO_NULL) + MPI_Info_dup(user_info, &ncp->mpiinfo); + goto fn_exit; + } } - *ncpp = (void*)ncp; + /* create file collectively -------------------------------------------- */ + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + /* If hint nc_striping is set to "auto" and hint striping_factor is not + * set by the user, then set hint striping_factor to ncp->num_nodes. 
+ */ + if (ncp->nc_striping == PNCIO_STRIPING_AUTO) { + int striping_factor=0; + if (user_info != MPI_INFO_NULL) { + MPI_Info_get(user_info, "striping_factor", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag) + striping_factor = atoi(value); + } + if (striping_factor == 0) { + sprintf(value, "%d", ncp->node_ids.num_nodes); + MPI_Info_set(user_info, "striping_factor", value); + } + } + + TRACE_IO(MPI_File_open, (comm, path, mpiomode, user_info, &fh)); + if (mpireturn != MPI_SUCCESS) { +#ifndef HAVE_ACCESS + if (fIsSet(cmode, NC_NOCLOBBER)) { + /* This is the case when NC_NOCLOBBER is used in file creation + * and function access() is not available. MPI_MODE_EXCL is set + * in open mode. When MPI_MODE_EXCL is used and the file + * already exists, MPI-IO should return error class + * MPI_ERR_FILE_EXISTS. But, some MPI-IO implementations (older + * ROMIO) do not correctly return this error class. In this + * case, we can do the followings: check errno to see if it set + * to EEXIST. Note usually rank 0 makes the file open call and + * can be the only one having errno set. + */ + if (nprocs > 1) + TRACE_COMM(MPI_Bcast)(&errno, 1, MPI_INT, 0, comm); + if (errno == EEXIST) { + NCI_Free(ncp); + DEBUG_FOPEN_ERROR(NC_EEXIST) + } + } +#endif + err = ncmpii_error_mpi2nc(mpireturn, "MPI_File_open"); + DEBUG_FOPEN_ERROR(err); + /* for NC_NOCLOBBER, MPI_MODE_EXCL was added to mpiomode. If the + * file already exists, MPI-IO should return error class + * MPI_ERR_FILE_EXISTS which PnetCDF will return error code + * NC_EEXIST. This is checked inside of ncmpii_error_mpi2nc() + */ + } + else + /* reset errno, as MPI_File_open may change it, even if it returns + * MPI_SUCCESS + */ + errno = 0; + + /* Now the file has been successfully created */ + ncp->collective_fh = fh; + ncp->independent_fh = (nprocs == 1) ? 
fh : MPI_FILE_NULL; + + /* get the I/O hints used/modified by MPI-IO */ + TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo)); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + DEBUG_FOPEN_ERROR(err); + } + } + else { + /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */ + ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1, sizeof(PNCIO_File)); + ncp->pncio_fh->file_system = ncp->fstype; + ncp->pncio_fh->node_ids = ncp->node_ids; + + err = PNCIO_File_open(comm, filename, mpiomode, user_info, + ncp->pncio_fh); + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err) + + /* Now the file has been successfully created, obtain the I/O hints + * used/modified by PNCIO driver. + */ + err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo); + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err) + } + +fn_exit: + striping_unit = -1; + + if (ncp->num_aggrs_per_node > 0) { + /* When intra-node aggregation is enabled, it is necessary to make sure + * non-aggregators obtain consistent values of file striping hints. 
+ * + * non-aggregator do not have hints returned from MPI_File_get_info() + */ + int striping_info[2]; + if (ncp->rank == 0) { + MPI_Info_get(ncp->mpiinfo, "striping_unit", MPI_MAX_INFO_VAL-1, + value, &flag); + striping_info[0] = 0; + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + striping_info[0] = (int)strtol(value,NULL,10); + if (errno != 0) striping_info[0] = 0; + } + + MPI_Info_get(ncp->mpiinfo, "striping_factor", MPI_MAX_INFO_VAL-1, + value, &flag); + striping_info[1] = 0; + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + striping_info[1] = (int)strtol(value,NULL,10); + if (errno != 0) striping_info[1] = 0; + } + } + + MPI_Bcast(striping_info, 2, MPI_INT, 0, ncp->comm); + + if (ncp->my_aggr != ncp->rank) { + sprintf(value, "%d", striping_info[0]); + MPI_Info_set(ncp->mpiinfo, "striping_unit", value); + sprintf(value, "%d", striping_info[1]); + MPI_Info_set(ncp->mpiinfo, "striping_factor", value); + } + + striping_unit = striping_info[0]; + } + else { + MPI_Info_get(ncp->mpiinfo, "striping_unit", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + striping_unit = (int)strtol(value,NULL,10); + if (errno != 0) striping_unit = -1; + } + } + + if (ncp->data_chunk == -1) + /* if not set by user hint, nc_data_move_chunk_size */ + ncp->data_chunk = (striping_unit > 0) ? 
striping_unit + : PNC_DATA_MOVE_CHUNK_SIZE; + + /* Copy MPI-IO hints into ncp->mpiinfo */ + ncmpio_hint_set(ncp, ncp->mpiinfo); + +/* +if (ncp->rank == 0) { + int i, nkeys; + MPI_Info_get_nkeys(ncp->mpiinfo, &nkeys); + printf("%s line %d: MPI File Info: nkeys = %d\n",__func__,__LINE__,nkeys); + for (i=0; impiinfo, i, key); + MPI_Info_get_valuelen(ncp->mpiinfo, key, &valuelen, &flag); + MPI_Info_get(ncp->mpiinfo, key, valuelen+1, value, &flag); + printf("MPI File Info: [%2d] key = %25s, value = %s\n",i,key,value); + } +} +*/ + + /* ina_node_list is no longer needed */ + if (ncp->ina_node_list != NULL) { + NCI_Free(ncp->ina_node_list); + ncp->ina_node_list = NULL; + } + if (ncp->num_aggrs_per_node > 0) { + /* node_ids is no longer needed. Note node_ids is duplicated above from + * the MPI communicator's cached keyval attribute when + * ncp->num_aggrs_per_node > 0. + */ + NCI_Free(ncp->node_ids.ids); + ncp->node_ids.ids = NULL; + } + if (ncp->pncio_fh != NULL) + ncp->pncio_fh->node_ids.ids = NULL; return NC_NOERR; } diff --git a/src/drivers/ncmpio/ncmpio_dim.c b/src/drivers/ncmpio/ncmpio_dim.c index 6d44dd1c91..273f9a4940 100644 --- a/src/drivers/ncmpio/ncmpio_dim.c +++ b/src/drivers/ncmpio/ncmpio_dim.c @@ -346,7 +346,7 @@ ncmpio_rename_dim(void *ncdp, #endif err_check: - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { /* check the error so far across processes */ int status, mpireturn; diff --git a/src/drivers/ncmpio/ncmpio_driver.h b/src/drivers/ncmpio/ncmpio_driver.h index f072b5d420..2d9924d895 100644 --- a/src/drivers/ncmpio/ncmpio_driver.h +++ b/src/drivers/ncmpio/ncmpio_driver.h @@ -12,10 +12,14 @@ #include extern int -ncmpio_create(MPI_Comm comm, const char *path, int cmode, int ncid, MPI_Info info, void **ncdp); +ncmpio_create(MPI_Comm comm, const char *path, int cmode, int ncid, + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int -ncmpio_open(MPI_Comm comm, const char *path, 
int omode, int ncid, MPI_Info info, void **ncdp); +ncmpio_open(MPI_Comm comm, const char *path, int omode, int ncid, + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncmpio_close(void *ncdp); diff --git a/src/drivers/ncmpio/ncmpio_enddef.c b/src/drivers/ncmpio/ncmpio_enddef.c index efc99657a6..90d648edec 100644 --- a/src/drivers/ncmpio/ncmpio_enddef.c +++ b/src/drivers/ncmpio/ncmpio_enddef.c @@ -30,12 +30,6 @@ #include "ncmpio_subfile.h" #endif -/* Divide the amount of data to be moved into chunks of size MOVE_UNIT each, - * and assign chunks to all processes. If the number of chunks is larger than - * the number of processes, carry out the data movement in multiple rounds. - */ -#define MOVE_UNIT 16777216 - #ifdef USE_POSIX_IO_TO_MOVE /*----< move_file_block() >-------------------------------------------------*/ /* Call POSIX I/O subroutines to move data */ @@ -56,19 +50,22 @@ move_file_block(NC *ncp, off_t off_last, off_from, off_to; char *path = ncmpii_remove_file_system_type_prefix(ncp->path); + /* check if this is a valid move request */ + if (to == from || nbytes == 0) return NC_NOERR; + rank = ncp->rank; nprocs = ncp->nprocs; /* buf will be used as a temporal buffer to move data in chunks, i.e. 
* read a chunk and later write to the new location */ - buf = NCI_Malloc(MOVE_UNIT); + buf = NCI_Malloc(ncp->data_chunk); if (buf == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) - p_units = MOVE_UNIT * nprocs; + p_units = (size_t)ncp->data_chunk * nprocs; num_moves = nbytes / p_units; if (nbytes % p_units) num_moves++; - off_last = (num_moves - 1) * p_units + rank * MOVE_UNIT; + off_last = (num_moves - 1) * p_units + (size_t)rank * ncp->data_chunk; off_from = from + off_last; off_to = to + off_last; mv_amnt = nbytes % p_units; @@ -79,8 +76,8 @@ move_file_block(NC *ncp, if (nbytes >= p_units) do_open = 1; else { - MPI_Offset n_units = nbytes / MOVE_UNIT; - if (nbytes % MOVE_UNIT) n_units++; + MPI_Offset n_units = nbytes / ncp->data_chunk; + if (nbytes % ncp->data_chunk) n_units++; if (rank < n_units) do_open = 1; } @@ -97,16 +94,16 @@ move_file_block(NC *ncp, if (mv_amnt == p_units) { /* each rank moves amount of chunk_size */ - chunk_size = MOVE_UNIT; + chunk_size = ncp->data_chunk; } else { /* when total move amount is less than p_units */ - size_t num_chunks = mv_amnt / MOVE_UNIT; - if (mv_amnt % MOVE_UNIT) num_chunks++; + size_t num_chunks = mv_amnt / ncp->data_chunk; + if (mv_amnt % ncp->data_chunk) num_chunks++; if (rank < num_chunks) { - chunk_size = MOVE_UNIT; - if (rank == num_chunks - 1 && mv_amnt % MOVE_UNIT > 0) - chunk_size = mv_amnt % MOVE_UNIT; + chunk_size = ncp->data_chunk; + if (rank == num_chunks - 1 && mv_amnt % ncp->data_chunk > 0) + chunk_size = mv_amnt % ncp->data_chunk; assert(chunk_size > 0); } else @@ -118,8 +115,8 @@ move_file_block(NC *ncp, get_size = pread(fd, buf, chunk_size, off_from); if (get_size < 0) { fprintf(stderr, - "Error at %s line %d: pread file %s offset "OFFFMT" size %zd (%s)\n", - __func__,__LINE__,path,off_from,chunk_size,strerror(errno)); + "Error at %s line %d: pread file %s offset %lld size %zd (%s)\n", + __func__,__LINE__,path,(long long)off_from,chunk_size,strerror(errno)); DEBUG_RETURN_ERROR(NC_EREAD) } ncp->get_size += 
get_size; @@ -138,8 +135,8 @@ move_file_block(NC *ncp, put_size = pwrite(fd, buf, get_size, off_to); if (put_size < 0) { fprintf(stderr, - "Error at %s line %d: pwrite file %s offset "OFFFMT" size %zd (%s)\n", - __func__,__LINE__,path,off_to,get_size,strerror(errno)); + "Error at %s line %d: pwrite file %s offset %lld size %zd (%s)\n", + __func__,__LINE__,path,(long long)off_to,get_size,strerror(errno)); DEBUG_RETURN_ERROR(NC_EREAD) } ncp->put_size += put_size; @@ -167,161 +164,126 @@ move_file_block(NC *ncp, MPI_Offset from, /* source starting file offset */ MPI_Offset nbytes) /* amount to be moved */ { - char *mpi_name; - int rank, nprocs, mpireturn, err, status=NC_NOERR, do_coll; + int rank, align_rank, nprocs, status=NC_NOERR, do_coll; void *buf; - size_t num_moves, mv_amnt, p_units; - MPI_Offset off_last, off_from, off_to; - MPI_Status mpistatus; - MPI_File fh; - - rank = ncp->rank; - nprocs = ncp->nprocs; + MPI_Offset mv_amnt, p_units, end_off, end_block; + MPI_Offset off_last, off_from, off_to, rlen, wlen; + MPI_Comm comm; + PNCIO_View buf_view; - /* collective_fh can be used in either MPI independent or collective I/O - * APIs to move data, within this subroutine. + /* If intra-node aggregation is enabled, then only the aggregators perform + * the movement. */ - fh = ncp->collective_fh; + if (ncp->num_aggrs_per_node > 0 && ncp->ina_comm == MPI_COMM_NULL) + return NC_NOERR; - /* MPI-IO fileview has been reset in ncmpi_redef() to make the entire file - * visible - */ + comm = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->comm : ncp->ina_comm; + rank = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->rank : ncp->ina_rank; + nprocs = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->nprocs : ncp->ina_nprocs; + + /* align file access for all ranks */ + align_rank = (to / ncp->data_chunk + rank) % nprocs; /* Use MPI collective I/O subroutines to move data, only if nproc > 1 and * MPI-IO hint "romio_no_indep_rw" is set to true. 
Otherwise, use MPI * independent I/O subroutines, as the data partitioned among processes are * not interleaved and thus need no collective I/O. */ - do_coll = (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)); + do_coll = (nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)); + + if (!do_coll && (to == from || nbytes == 0)) return NC_NOERR; + + /* MPI-IO fileview has been reset in ncmpi_redef() to make the entire file + * visible + */ /* buf will be used as a temporal buffer to move data in chunks, i.e. * read a chunk and later write to the new location */ - buf = NCI_Malloc(MOVE_UNIT); + buf = NCI_Malloc(ncp->data_chunk); if (buf == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) - p_units = MOVE_UNIT * nprocs; - num_moves = nbytes / p_units; - if (nbytes % p_units) num_moves++; - off_last = (num_moves - 1) * p_units + rank * MOVE_UNIT; - off_from = from + off_last; - off_to = to + off_last; - mv_amnt = nbytes % p_units; - if (mv_amnt == 0 && nbytes > 0) mv_amnt = p_units; + /* buffer used to move data is always contiguous */ + buf_view.type = MPI_BYTE; + buf_view.count = 0; + buf_view.is_contig = 1; + + /* movement must start from the last p_units toward the 1st */ + p_units = (MPI_Offset)ncp->data_chunk * nprocs; + end_off = to + nbytes; + end_block = end_off % p_units; + off_last = end_off - end_block; + + /* align file writes for all ranks (reads will not be aligned) */ + if (align_rank < end_block / ncp->data_chunk) + mv_amnt = ncp->data_chunk; + else if (end_block % ncp->data_chunk > 0 && + align_rank == end_block / ncp->data_chunk) + mv_amnt = end_block % ncp->data_chunk; + else + mv_amnt = 0; - /* move the data section starting from its tail toward its beginning */ - while (nbytes > 0) { - int chunk_size, get_size=0; + /* set the 1st read-write pair */ + off_to = off_last + (MPI_Offset)align_rank * ncp->data_chunk; + off_from = off_to - (to - from); - if (mv_amnt == p_units) { - /* each rank moves amount of chunk_size */ - chunk_size = MOVE_UNIT; - } - else { - /* when
total move amount is less than p_units */ - size_t num_chunks = mv_amnt / MOVE_UNIT; - if (mv_amnt % MOVE_UNIT) num_chunks++; - if (rank < num_chunks) { - chunk_size = MOVE_UNIT; - if (rank == num_chunks - 1 && mv_amnt % MOVE_UNIT > 0) - chunk_size = mv_amnt % MOVE_UNIT; - assert(chunk_size > 0); - } - else - chunk_size = 0; - } + if (off_from < from) + off_from = from; - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - mpireturn = MPI_SUCCESS; + if (off_to < to) { + mv_amnt -= to - off_to; + if (mv_amnt < 0) mv_amnt = 0; + off_to = to; + } - /* read from file at off_from for amount of chunk_size */ - if (do_coll) { - TRACE_IO(MPI_File_read_at_all, (fh, off_from, buf, chunk_size, - MPI_BYTE, &mpistatus)); - } - else if (chunk_size > 0) { - TRACE_IO(MPI_File_read_at, (fh, off_from, buf, chunk_size, - MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR && err == NC_EFILE) - DEBUG_ASSIGN_ERROR(status, NC_EREAD) - get_size = chunk_size; - } - else if (chunk_size > 0) { - /* for zero-length read, MPI_Get_count may report incorrect result - * for some MPICH version, due to the uninitialized MPI_Status - * object passed to MPI-IO calls. Thus we initialize it above to - * work around. See MPICH ticket: - * https://trac.mpich.org/projects/mpich/ticket/2332 - * - * Update the number of bytes read since file open. - * Because each rank reads and writes no more than one chunk_size - * at a time and chunk_size is < NC_MAX_INT, it is OK to call - * MPI_Get_count, instead of MPI_Get_count_c. 
- */ - MPI_Get_count(&mpistatus, MPI_BYTE, &get_size); - ncp->get_size += get_size; - } + /* pad the remainder of the last p_units */ + nbytes += p_units - end_block; - /* to prevent from one rank's write run faster than other's read */ - if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); + while (nbytes > 0) { + buf_view.size = mv_amnt; - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - mpireturn = MPI_SUCCESS; + /* read from file at off_from for amount of mv_amnt */ + rlen = 0; + if (do_coll) + rlen = ncmpio_file_read_at_all(ncp, off_from, buf, buf_view); + else if (mv_amnt > 0) + rlen = ncmpio_file_read_at(ncp, off_from, buf, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; + + /* prevent one rank's write from running ahead of another's read */ + if (nprocs > 1) MPI_Barrier(comm); - /* Write to new location at off_to for amount of get_size. Assuming the - * call to MPI_Get_count() above returns the actual amount of data read - * from the file, i.e. get_size. + /* Write to new location at off_to for amount of rlen, the actual read + * amount is rlen. */ - if (do_coll) { - TRACE_IO(MPI_File_write_at_all, (fh, off_to, buf, - get_size /* NOT chunk_size */, - MPI_BYTE, &mpistatus)); - } - else if (get_size > 0) { - TRACE_IO(MPI_File_write_at, (fh, off_to, buf, - get_size /* NOT chunk_size */, - MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR && err == NC_EFILE) - DEBUG_ASSIGN_ERROR(status, NC_EWRITE) - } - else if (get_size > 0) { - /* update the number of bytes written since file open.
- * Because each rank reads and writes no more than one chunk_size - * at a time and chunk_size is < NC_MAX_INT, it is OK to call - * MPI_Get_count, instead of MPI_Get_count_c. - */ - int put_size; - mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size); - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += get_size; /* or chunk_size */ - else - ncp->put_size += put_size; - } + buf_view.size = rlen; + wlen = 0; + if (do_coll) /* even when rlen == 0, must still participate */ + wlen = ncmpio_file_write_at_all(ncp, off_to, buf, buf_view); + else if (rlen > 0) + wlen = ncmpio_file_write_at(ncp, off_to, buf, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; /* move on to the next round */ - mv_amnt = p_units; - off_from -= mv_amnt; - off_to -= mv_amnt; - nbytes -= mv_amnt; + nbytes -= p_units; + off_from -= p_units; + off_to -= p_units; + + /* mv_amnt becomes ncp->data_chunk in the 2nd and later rounds */ + mv_amnt = ncp->data_chunk; + + /* special treatment for the 1st p_units */ + if (off_to < to) { + mv_amnt -= to - off_to; + if (mv_amnt < 0) mv_amnt = 0; + off_to = to; + } + if (off_from < from) + off_from = from; } NCI_Free(buf); + return status; } #endif @@ -371,12 +333,13 @@ move_record_vars(NC *ncp, NC *old) { * ncp->numrecs ---- number of records (set only if new file) */ static int -NC_begins(NC *ncp) +NC_begins(NC *ncp, + MPI_Offset v_align, + MPI_Offset r_align) { int i, j, mpireturn; MPI_Offset end_var=0; NC_var *last = NULL; - NC_var *first_var = NULL; /* first "non-record" var */ /* For CDF-1 and 2 formats, a variable's "begin" in the header is 4 bytes. * For CDF-5, it is 8 bytes. 
@@ -385,7 +348,7 @@ NC_begins(NC *ncp) /* get the true header size (not header extent) */ ncp->xsz = ncmpio_hdr_len_NC(ncp); - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { /* this consistency check is redundant as metadata is kept consistent * at all time when safe mode is on */ @@ -411,28 +374,90 @@ NC_begins(NC *ncp) if (status != NC_NOERR) DEBUG_RETURN_ERROR(status) } + if (ncp->vars.ndefined == 0) { + /* There is no variable defined, ignore alignment and set header extent + * to header size. + */ + ncp->begin_var = MAX(ncp->begin_var, ncp->xsz); + ncp->begin_rec = ncp->begin_var; + ncp->recsize = 0; + ncp->numrecs = 0; + ncp->v_align = 0; + ncp->r_align = 0; + return NC_NOERR; + } + + /* calculate a good file extent alignment size based on user hints. + * The precedence of hints: + * + 1st priority: hints set in the environment variable PNETCDF_HINTS, + * i.e. nc_var_align_size and nc_record_align_size + * e.g. PNETCDF_HINTS="nc_var_align_size=1024" + * + 2nd priority: hints set in the MPI info objects passed into calls to + * ncmpi_create() and ncmpi_open() + * e.g. MPI_Info_set("nc_var_align_size", "1024"); + * + 3rd priority: hints passed from arguments of ncmpi__enddef() + * i.e. v_align and r_align + * e.g. ncmpi__enddef(..., v_align=1024,...) 
+ * + * Default values + * NC_DEFAULT_H_MINFREE for h_minfree + * NC_DEFAULT_V_ALIGN for v_align + * NC_DEFAULT_V_MINFREE for v_minfree + * NC_DEFAULT_R_ALIGN for r_align + */ + + /* determine header extent (alignment for the data section) */ + if (ncp->info_v_align == -1) { + /* hint nc_var_align_size is not set */ + + /* argument v_align is set by user */ + if (v_align > 0) + ncp->v_align = D_RNDUP(v_align, 4); + else + ncp->v_align = NC_DEFAULT_V_ALIGN; + } + else + ncp->v_align = D_RNDUP(ncp->info_v_align, 4); + + /* determine alignment for record variable section */ + if (ncp->info_r_align == -1) { + /* hint nc_record_align_size is not set */ + + /* argument r_align is set by user */ + if (r_align > 0) + ncp->r_align = D_RNDUP(r_align, 4); + else if (ncp->vars.ndefined > ncp->vars.num_rec_vars) + ncp->r_align = NC_DEFAULT_R_ALIGN; + else + ncp->r_align = NC_DEFAULT_V_ALIGN; + } + else + ncp->r_align = D_RNDUP(ncp->info_r_align, 4); + /* This function is called in ncmpi_enddef(), which can happen either when * creating a new file and first time call to ncmpi_enddef(), or other * case, e.g. opening an existing file, calling ncmpi_redef(), and then * ncmpi_enddef(). For the former case, ncp->begin_var == 0. For the latter - * case, ncp->begin_var must be > 0, as it is the orignial header extent. + * case, ncp->begin_var must be > 0, as it is the original header extent. * We increase begin_var only if the new header size grows out of its * original extent, or the start of variable section is not aligned as * requested by ncp->v_align. Note ncp->xsz is header size and * ncp->begin_var is header extent. Growth of header extent must also * respect the minimum header free space requested by user. 
*/ - ncp->begin_var = MAX(ncp->begin_var, ncp->xsz + ncp->h_minfree); - /* align header extent */ - if (ncp->vars.ndefined > 0) - ncp->begin_var = D_RNDUP(ncp->begin_var, ncp->v_align); - else /* no variable defined, ignore alignment and set header extent to - * header size */ - ncp->begin_var = MAX(ncp->begin_var, ncp->xsz); + /* warrant a free space at the end of header section */ + ncp->begin_var = MAX(ncp->begin_var, ncp->xsz + ncp->h_minfree); + /* Previously begin_var may be calculated using a different h_minfree and + * v_align. Thus it can be larger than this round's calculation. + */ if (ncp->old != NULL) - assert(ncp->begin_var >= ncp->old->begin_var); + ncp->begin_var = MAX(ncp->begin_var, ncp->old->begin_var); + + /* align header extent if there are fix-sized variables */ + if (ncp->vars.ndefined > ncp->vars.num_rec_vars) + ncp->begin_var = D_RNDUP(ncp->begin_var, ncp->v_align); /* ncp->begin_var is the aligned starting file offset of the first * variable (also data section), which is the extent of file header @@ -447,8 +472,6 @@ NC_begins(NC *ncp) /* skip record variables on this pass */ if (IS_RECVAR(ncp->vars.value[i])) continue; - if (first_var == NULL) first_var = ncp->vars.value[i]; - /* for CDF-1 check if over the file size limit 32-bit integer */ if (ncp->format == 1 && end_var > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EVARSIZE) @@ -473,52 +496,43 @@ NC_begins(NC *ncp) /* end_var is the end offset of variable i */ end_var = ncp->vars.value[i]->begin + ncp->vars.value[i]->len; } + /* end_var now is pointing to the end of last fix-sized variable */ - /* end_var now is pointing to the end of last non-record variable */ + ncp->fix_end = D_RNDUP(end_var, 4); - /* only (re)calculate begin_rec if there is no sufficient space at end of - * non-record variables or if the start of record variables is not aligned - * as requested by ncp->r_align. 
+ /* warrant a free space at the end of fix-sized variable section */ + if (ncp->vars.ndefined > ncp->vars.num_rec_vars) + ncp->begin_rec = ncp->fix_end + ncp->v_minfree; + else /* Ignore v_minfree when there is no fix-sized variable. */ + ncp->begin_rec = ncp->fix_end; + + /* Previously begin_rec may be calculated using a different v_minfree and + * r_align. Thus it can be larger than this round's calculation. */ - if (ncp->vars.ndefined > ncp->vars.num_rec_vars) { - if (ncp->begin_rec < end_var + ncp->v_minfree) - ncp->begin_rec = end_var + ncp->v_minfree; - } - else { /* if there is no fix-sized variable, ignore v_minfree */ - if (ncp->begin_rec < end_var) - ncp->begin_rec = end_var; - } + if (ncp->old != NULL) + ncp->begin_rec = MAX(ncp->begin_rec, ncp->old->begin_rec); - ncp->begin_rec = D_RNDUP(ncp->begin_rec, 4); + /* align the starting offset of record variable section */ + ncp->begin_rec = D_RNDUP(ncp->begin_rec, ncp->r_align); - /* Align the starting offset for record variable section. - * Ignore ncp->r_align, if there is no fix-sized variable. 
- */ - if (ncp->r_align > 1 && ncp->vars.ndefined > ncp->vars.num_rec_vars) - ncp->begin_rec = D_RNDUP(ncp->begin_rec, ncp->r_align); + /* When there is no fix-sized variable, set begin_var == begin_rec */ + if (ncp->vars.ndefined == ncp->vars.num_rec_vars) + ncp->begin_var = ncp->begin_rec; if (ncp->old != NULL) { - /* check whether the new begin_rec is smaller */ - if (ncp->begin_rec < ncp->old->begin_rec) - ncp->begin_rec = ncp->old->begin_rec; + assert(ncp->begin_var >= ncp->old->begin_var); + assert(ncp->begin_rec >= ncp->old->begin_rec); } - if (first_var != NULL) ncp->begin_var = first_var->begin; - else ncp->begin_var = ncp->begin_rec; - - end_var = ncp->begin_rec; - /* end_var now is pointing to the beginning of record variables - * note that this can be larger than the end of last non-record variable + /* Alignment r_align is only applicable to the record variable section, + * not individual record variables. */ - ncp->recsize = 0; - - /* The alignment is only applicable to the section of record variables, - * rather than individual record variables. + /* Loop through record variables and calculate the starting offset of each + * record variable. */ - - /* loop thru vars, second pass is for the 'record' vars, - * re-calculate the starting offset for each record variable */ + end_var = ncp->begin_rec; + ncp->recsize = 0; for (j=0, i=0; i<ncp->vars.ndefined; i++) { if (!IS_RECVAR(ncp->vars.value[i])) /* skip non-record variables on this pass */ @@ -558,13 +572,12 @@ NC_begins(NC *ncp) last = ncp->vars.value[i]; } - /* - * for special case (Check CDF-1 and CDF-2 file format specifications.) + /* For special case (Check CDF-1 and CDF-2 file format specifications.) * "A special case: Where there is exactly one record variable, we drop the * requirement that each record be four-byte aligned, so in this case there * is no record padding." */ - if (last != NULL) { + if (last != NULL) { /* i.e.
at least one record variable */ if (ncp->recsize == last->len) { /* exactly one record variable, pack value */ ncp->recsize = *last->dsizes * last->xsz; @@ -576,13 +589,15 @@ NC_begins(NC *ncp) #endif } -/* below is only needed if alignment is performed on record variables */ #if 0 - /* + /* This code block aligns individual record variables, which is no + * longer needed. + * * for special case of exactly one record variable, pack value + * + * if there is exactly one record variable, then there is no need to + * pad for alignment -- there's nothing after it. */ - /* if there is exactly one record variable, then there is no need to - * pad for alignment -- there's nothing after it */ if (last != NULL && ncp->recsize == last->len) ncp->recsize = *last->dsizes * last->xsz; #endif @@ -602,20 +617,28 @@ NC_begins(NC *ncp) static int write_NC(NC *ncp) { - char *mpi_name; - int status=NC_NOERR, mpireturn, err, is_coll; + int status=NC_NOERR, is_coll=0; MPI_Offset i, header_wlen, ntimes; - MPI_File fh; - MPI_Status mpistatus; + PNCIO_View buf_view; assert(!NC_readonly(ncp)); + buf_view.is_contig = 1; + buf_view.off = NULL; + buf_view.len = NULL; + /* Depending on whether NC_HCOLL is set, writing file header can be done * through either MPI collective or independent write call. * When * ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh + * For those ranks participating in the collective MPI write call, their + * is_coll is set to 1, otherwise 0. */ - is_coll = (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)) ? 1 : 0; - fh = ncp->collective_fh; + if (fIsSet(ncp->flags, NC_HCOLL)) { + if (ncp->num_aggrs_per_node > 0) + is_coll = (ncp->ina_nprocs > 1 && ncp->rank == ncp->my_aggr); + else + is_coll = (ncp->nprocs > 1); + } /* In NC_begins(), root's ncp->xsz and ncp->begin_var, root's header * size and extent, have been broadcast (sync-ed) among processes.
@@ -673,64 +696,46 @@ write_NC(NC *ncp) /* rank 0's fileview already includes the file header */ - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - /* write the header in chunks */ offset = 0; remain = header_wlen; buf_ptr = buf; + buf_view.type = MPI_BYTE; + buf_view.count = 0; for (i=0; iput_size += bufCount; - else - ncp->put_size += put_size; - } - offset += bufCount; - buf_ptr += bufCount; - remain -= bufCount; + MPI_Offset wlen; + buf_view.size = MIN(remain, NC_MAX_INT); + if (is_coll) + wlen = ncmpio_file_write_at_all(ncp, offset, buf_ptr, buf_view); + else + wlen = ncmpio_file_write_at(ncp, offset, buf_ptr, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + + offset += buf_view.size; + buf_ptr += buf_view.size; + remain -= buf_view.size; } NCI_Free(buf); } - else if (fIsSet(ncp->flags, NC_HCOLL)) { + else if (is_coll) { /* other processes participate the collective call */ - for (i=0; isafe_mode == 1 && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { /* broadcast root's status, because only root writes to the file */ - int root_status = status; + int mpireturn, root_status = status; TRACE_COMM(MPI_Bcast)(&root_status, 1, MPI_INT, 0, ncp->comm); - /* root's write has failed, which is more serious than inconsistency */ - if (root_status == NC_EWRITE) DEBUG_ASSIGN_ERROR(status, NC_EWRITE) + if (mpireturn != MPI_SUCCESS) + status = ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast"); + else if (root_status == NC_EWRITE) + /* root's write has failed, more serious than inconsistency */ + DEBUG_ASSIGN_ERROR(status, NC_EWRITE) } fClr(ncp->flags, NC_NDIRTY); @@ -745,15 +750,15 @@ write_NC(NC *ncp) * do not get error and proceed to the next subroutine call. 
*/ #define CHECK_ERROR(err) { \ - if (ncp->safe_mode == 1 && ncp->nprocs > 1) { \ - int status; \ - TRACE_COMM(MPI_Allreduce)(&err, &status, 1, MPI_INT, MPI_MIN, \ + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { \ + int min_err; \ + TRACE_COMM(MPI_Allreduce)(&err, &min_err, 1, MPI_INT, MPI_MIN, \ ncp->comm); \ if (mpireturn != MPI_SUCCESS) { \ err = ncmpii_error_mpi2nc(mpireturn, "MPI_Allreduce"); \ DEBUG_RETURN_ERROR(err) \ } \ - if (status != NC_NOERR) return status; \ + if (min_err != NC_NOERR) return min_err; \ } \ else if (err != NC_NOERR) \ return err; \ @@ -952,7 +957,7 @@ ncmpio_NC_check_voffs(NC *ncp) for (i=0, j=0; ivars.ndefined; i++) { NC_var *varp = ncp->vars.value[i]; if (varp->begin < ncp->xsz) { - if (ncp->safe_mode) { + if (fIsSet(ncp->flags, NC_MODE_SAFE)) { printf("Variable %s begin offset ("OFFFMT") is less than file header extent ("OFFFMT")\n", varp->name, varp->begin, ncp->xsz); } @@ -979,7 +984,7 @@ ncmpio_NC_check_voffs(NC *ncp) max_var_end = var_off_len[0].off + var_off_len[0].len; for (i=1; isafe_mode) { + if (fIsSet(ncp->flags, NC_MODE_SAFE)) { NC_var *var_cur = ncp->vars.value[var_off_len[i].ID]; NC_var *var_prv = ncp->vars.value[var_off_len[i-1].ID]; printf("Variable %s begin offset ("OFFFMT") overlaps variable %s (begin="OFFFMT", length="OFFFMT")\n", @@ -993,7 +998,7 @@ ncmpio_NC_check_voffs(NC *ncp) } if (ncp->begin_rec < max_var_end) { - if (ncp->safe_mode) + if (fIsSet(ncp->flags, NC_MODE_SAFE)) printf("Record variable section begin ("OFFFMT") is less than fixed-size variable section end ("OFFFMT")\n", ncp->begin_rec, max_var_end); NCI_Free(var_off_len); @@ -1027,7 +1032,7 @@ ncmpio_NC_check_voffs(NC *ncp) for (i=1; ivars.num_rec_vars; i++) { if (var_off_len[i].off < var_off_len[i-1].off + var_off_len[i-1].len) { - if (ncp->safe_mode) { + if (fIsSet(ncp->flags, NC_MODE_SAFE)) { NC_var *var_cur = ncp->vars.value[var_off_len[i].ID]; NC_var *var_prv = ncp->vars.value[var_off_len[i-1].ID]; printf("Variable %s begin offset 
("OFFFMT") overlaps variable %s (begin="OFFFMT", length="OFFFMT")\n", @@ -1050,7 +1055,7 @@ ncmpio_NC_check_voffs(NC *ncp) if (IS_RECVAR(varp)) continue; if (varp->begin < prev_off) { - if (ncp->safe_mode) { + if (fIsSet(ncp->flags, NC_MODE_SAFE)) { if (i == 0) printf("Variable \"%s\" begin offset ("OFFFMT") is less than header extent ("OFFFMT")\n", varp->name, varp->begin, prev_off); @@ -1065,7 +1070,7 @@ ncmpio_NC_check_voffs(NC *ncp) } if (ncp->begin_rec < prev_off) { - if (ncp->safe_mode) + if (fIsSet(ncp->flags, NC_MODE_SAFE)) printf("Record variable section begin offset ("OFFFMT") is less than fixed-size variable section end offset ("OFFFMT")\n", ncp->begin_rec, prev_off); DEBUG_RETURN_ERROR(NC_ENOTNC) @@ -1082,7 +1087,7 @@ ncmpio_NC_check_voffs(NC *ncp) if (!IS_RECVAR(varp)) continue; if (varp->begin < prev_off) { - if (ncp->safe_mode) { + if (fIsSet(ncp->flags, NC_MODE_SAFE)) { printf("Variable \"%s\" begin offset ("OFFFMT") is less than previous variable end offset ("OFFFMT")\n", varp->name, varp->begin, prev_off); if (i == 0) @@ -1102,6 +1107,7 @@ ncmpio_NC_check_voffs(NC *ncp) return NC_NOERR; } +#if 0 /*----< read_hints() >-------------------------------------------------------*/ /* check only the following hints set in environment variable PNETCDF_HINTS or * MPI_Info object passed to ncmpi_create() and ncmpi_open(). @@ -1120,7 +1126,7 @@ read_hints(NC *ncp) /* get hints from the environment variable PNETCDF_HINTS, a string of * hints separated by ";" and each hint is in the form of hint=value. E.g. - * "cb_nodes=16;cb_config_list=*:6". If this environment variable is set, + * "cb_nodes=16;romio_ds_write=true". If this environment variable is set, * it overrides the same hints that were set by MPI_Info_set() called in * the application program. 
*/ @@ -1183,6 +1189,7 @@ read_hints(NC *ncp) /* return no error as all hints are advisory */ } +#endif /*----< ncmpio__enddef() >---------------------------------------------------*/ /* This is a collective subroutine. @@ -1203,7 +1210,7 @@ ncmpio__enddef(void *ncdp, MPI_Offset v_minfree, MPI_Offset r_align) { - int i, num_fix_vars, mpireturn, err=NC_NOERR, status=NC_NOERR; + int i, mpireturn, err=NC_NOERR, status=NC_NOERR; char value[MPI_MAX_INFO_VAL]; MPI_Offset saved_begin_var; NC *ncp = (NC*)ncdp; @@ -1217,117 +1224,22 @@ ncmpio__enddef(void *ncdp, * called from ncmpio_enddef(). */ - /* check hints from environment variable PNETCDF_HINTS, or MPI info */ - read_hints(ncp); + /* Checking hints from environment variable PNETCDF_HINTS and MPI info has + * been done at the dispatcher, combine_env_hints(), when create/open the + * file. Calling read_hints(ncp); is no longer necessary here. + */ /* sanity check for NC_ENOTINDEFINE, NC_EINVAL, NC_EMULTIDEFINE_FNC_ARGS * has been done at dispatchers */ ncp->h_minfree = (h_minfree < 0) ? NC_DEFAULT_H_MINFREE : h_minfree; ncp->v_minfree = (v_minfree < 0) ? NC_DEFAULT_V_MINFREE : v_minfree; - /* calculate a good file extent alignment size based on user hints. - * The precedence of hints: - * + 1st priority: hints set in the environment variable PNETCDF_HINTS, - * i.e. nc_var_align_size and nc_record_align_size - * e.g. PNETCDF_HINTS="nc_var_align_size=1024" - * + 2nd priority: hints set in the MPI info objects passed into calls to - * ncmpi_create() and ncmpi_open() - * e.g. MPI_Info_set("nc_var_align_size", "1024"); - * + 3rd priority: hints passed from arguments of ncmpi__enddef() - * i.e. v_align and r_align - * e.g. ncmpi__enddef(..., v_align=1024,...) 
- * - * Default values - * NC_DEFAULT_H_MINFREE for h_minfree - * NC_DEFAULT_V_ALIGN for v_align - * NC_DEFAULT_V_MINFREE for v_minfree - * NC_DEFAULT_R_ALIGN for r_align - */ - - num_fix_vars = ncp->vars.ndefined - ncp->vars.num_rec_vars; - - /* determine header extent (alignment for the data section) */ - if (ncp->env_v_align == -1) { - /* hint nc_var_align_size is not set in PNETCDF_HINTS */ - ncp->v_align = -1; - - if (num_fix_vars == 0 && ncp->env_r_align != -1) - /* if no fix-sizes variable, try use env_r_align */ - ncp->v_align = ncp->env_r_align; - - if (ncp->v_align < 0) { /* ncp->v_align is still not set */ - if (ncp->info_v_align >= 0) - /* use hint set in MPI info passed to ncmpi_create/ncmpi_open */ - ncp->v_align = ncp->info_v_align; - else if (v_align >= 0) - /* valid v_align is passed from ncmpi__enddef */ - ncp->v_align = v_align; - } - - if (ncp->v_align < 0) { /* ncp->v_align is still not set */ - if (ncp->old != NULL) - /* if enter from redefine mode, reuse one set in old header */ - ncp->v_align = ncp->old->v_align; - else /* default */ - ncp->v_align = NC_DEFAULT_V_ALIGN; - } - } - else /* hint nc_var_align_size is set in PNETCDF_HINTS, use it and - * ignore v_align passed from ncmpi__enddef(). 
- */ - ncp->v_align = ncp->env_v_align; - - /* determine alignment for record variable section */ - if (ncp->env_r_align == -1) { - /* hint nc_record_align_size is not set in PNETCDF_HINTS */ - ncp->r_align = -1; - - if (ncp->info_r_align >= 0) - /* use hint set in MPI info passed to ncmpi_create/ncmpi_open */ - ncp->r_align = ncp->info_r_align; - else if (r_align >= 0) - /* valid r_align is passed from ncmpi__enddef */ - ncp->r_align = r_align; - - if (ncp->r_align == -1) { /* ncp->r_align is still not set */ - if (ncp->old != NULL) - /* reuse one set in old header */ - ncp->r_align = ncp->old->r_align; - else - ncp->r_align = NC_DEFAULT_R_ALIGN; - } - } - else - /* hint nc_record_align_size is set in PNETCDF_HINTS, use it and - * ignore r_align passed from ncmpi__enddef(). - */ - ncp->r_align = ncp->env_r_align; - - /* all CDF formats require 4-bytes alignment */ - if (ncp->v_align == 0) ncp->v_align = 4; - else ncp->v_align = D_RNDUP(ncp->v_align, 4); - if (ncp->r_align == 0) ncp->r_align = 4; - else ncp->r_align = D_RNDUP(ncp->r_align, 4); - - /* reflect the hint changes to the MPI info object, so the user can inquire - * what the true hint values are being used - */ - sprintf(value, OFFFMT, ncp->v_align); - MPI_Info_set(ncp->mpiinfo, "nc_var_align_size", value); - sprintf(value, OFFFMT, ncp->r_align); - MPI_Info_set(ncp->mpiinfo, "nc_record_align_size", value); - #ifdef ENABLE_SUBFILING - sprintf(value, "%d", ncp->num_subfiles); - MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", value); if (ncp->num_subfiles > 1) { /* TODO: should return subfile-related msg when there's an error */ err = ncmpio_subfile_partition(ncp); CHECK_ERROR(err) } -#else - MPI_Info_set(ncp->mpiinfo, "pnetcdf_subfiling", "disable"); - MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", "0"); #endif /* check whether sizes of all variables are legal */ @@ -1344,12 +1256,12 @@ ncmpio__enddef(void *ncdp, * all processes. 
*/ saved_begin_var = ncp->begin_var; - err = NC_begins(ncp); + err = NC_begins(ncp, v_align, r_align); if (err != NC_NOERR) /* restore the original begin_var when failed */ ncp->begin_var = saved_begin_var; CHECK_ERROR(err) - if (ncp->safe_mode) { + if (fIsSet(ncp->flags, NC_MODE_SAFE)) { /* check whether variable begins are in an increasing order. * This check is for debugging purpose. */ err = ncmpio_NC_check_voffs(ncp); @@ -1359,10 +1271,10 @@ ncmpio__enddef(void *ncdp, #ifdef ENABLE_SUBFILING if (ncp->num_subfiles > 1) { /* get ncp info for the subfile */ - err = NC_begins(ncp->ncp_sf); + err = NC_begins(ncp->ncp_sf, v_align, r_align); CHECK_ERROR(err) - if (ncp->safe_mode) { + if (fIsSet(ncp->flags, NC_MODE_SAFE)) { /* check whether variable begins are in an increasing order. * This check is for debugging purpose. */ err = ncmpio_NC_check_voffs(ncp->ncp_sf); @@ -1373,8 +1285,9 @@ ncmpio__enddef(void *ncdp, if (ncp->old != NULL && ncp->vars.ndefined > 0) { /* The current define mode was entered from ncmpi_redef, not from - * ncmpi_create. We must check if header extent has grown. - * This only needs to be done when there are variables defined. + * ncmpi_create. We must individually check if the three sections of + * header, fix-sized, and record variables have grown. This is only + * required when there are variables defined. */ int mov_done=0; MPI_Offset nbytes; @@ -1387,111 +1300,64 @@ ncmpio__enddef(void *ncdp, /* ncp->numrecs has already sync-ed in ncmpi_redef */ - if (ncp->begin_var > ncp->old->begin_var && - ncp->begin_rec - ncp->begin_var == - ncp->old->begin_rec - ncp->old->begin_var && - ncp->vars.num_rec_vars == ncp->old->vars.num_rec_vars) { - /* When header extent grows, if the distance between the starting - * offsets of fix-sized and record variable sections remains the - * same, and no new record variable has been added, then the entire - * data section can be moved as a single contiguous block to a - * higher file offset. 
- */ - - /* Make sure all processes finish their I/O before any process - * starts to read the data section. - */ - if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); + /* Make sure all processes finish their I/O before any process starts + * to read the data section. + */ + if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); - /* amount of data section to be moved */ - nbytes = ncp->old->begin_rec - ncp->old->begin_var - + ncp->old->recsize * ncp->old->numrecs; + mov_done = 0; - err = move_file_block(ncp, ncp->begin_var, ncp->old->begin_var, - nbytes); - if (status == NC_NOERR) status = err; - mov_done = 1; - } - else { - if (ncp->begin_rec > ncp->old->begin_rec) { - /* beginning of record variable section grows. The entire - * record variable section must be moved to a higher file - * offset. + /* move record variable section first */ + if (ncp->begin_rec > ncp->old->begin_rec || + ncp->vars.num_rec_vars > ncp->old->vars.num_rec_vars) { + /* It is possible begin_rec remain the same after adding new record + * variables, e.g. when both header extent and fix-sized variable + * section did not grow. + */ + if (ncp->vars.num_rec_vars == ncp->old->vars.num_rec_vars) { + /* No new record variable is added. Move the entire record + * variable section as a single data chunk. */ + nbytes = ncp->old->recsize * ncp->old->numrecs; - /* Make sure all processes finish their I/O before any process - * starts to read the data section. - */ - if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); - - if (ncp->vars.num_rec_vars == ncp->old->vars.num_rec_vars) { - /* no new record variable has been added, then the entire - * record variable section can be moved as a single - * contiguous block - */ - - /* amount of data to be moved */ - nbytes = ncp->old->recsize * ncp->old->numrecs; - - err = move_file_block(ncp, ncp->begin_rec, - ncp->old->begin_rec, nbytes); - if (status == NC_NOERR) status = err; - } - else { - /* new record variables have been added. 
Must move one - * record at a time, because all records of record - * variables are stored interleaved in the file. - */ - err = move_record_vars(ncp, ncp->old); - if (status == NC_NOERR) status = err; - } - mov_done = 1; + err = move_file_block(ncp, ncp->begin_rec, ncp->old->begin_rec, + nbytes); + if (status == NC_NOERR) status = err; } + else { + /* Move one record variable at a time */ + err = move_record_vars(ncp, ncp->old); + if (status == NC_NOERR) status = err; + } + mov_done = 1; + } - if (ncp->begin_var > ncp->old->begin_var) { - /* beginning of fix-sized variable section grows. The fix-sized - * variable section must be moved to a higher file offset. - */ + /* Move fix-sized variable section when starting offset grows and there + * are fix-sized variables defined. + */ + if (ncp->begin_var > ncp->old->begin_var && + ncp->old->vars.ndefined > ncp->old->vars.num_rec_vars) { - /* Make sure all processes finish their I/O before any process - * starts to read the data section. - */ - if (!mov_done && ncp->nprocs > 1) MPI_Barrier(ncp->comm); + nbytes = ncp->old->fix_end - ncp->old->begin_var; - /* First, find the size of fix-sized variable section, i.e. - * from the last fix-sized variable's begin and len. Note there - * may be some free space at the end of fix-sized variable - * section that need not be moved. 
- */ - MPI_Offset end_var = ncp->old->begin_var; - for (i=ncp->old->vars.ndefined-1; i>=0; i--) { - if (!IS_RECVAR(ncp->old->vars.value[i])) { - end_var = ncp->old->vars.value[i]->begin - + ncp->old->vars.value[i]->len; - break; - } - } - /* amount of data to be moved */ - nbytes = end_var - ncp->old->begin_var; - - err = move_file_block(ncp, ncp->begin_var, ncp->old->begin_var, - nbytes); - if (status == NC_NOERR) status = err; - mov_done = 1; - } + err = move_file_block(ncp, ncp->begin_var, ncp->old->begin_var, + nbytes); + if (status == NC_NOERR) status = err; + mov_done = 1; } - /* to prevent some ranks run faster than others and start to read - * after exiting ncmpi_enddef(), while some processes are still moving - * the data section + /* To prevent some ranks run faster than others and start to read after + * exiting ncmpi_enddef(), while some processes are still moving the + * data section */ if (mov_done && ncp->nprocs > 1) MPI_Barrier(ncp->comm); } /* ... ncp->old != NULL */ /* first sync header objects in memory across all processes, and then root - * writes the header to file. Note safe_mode error check will be done in - * write_NC() */ + * writes the header to file. Note safe mode error check will be done in + * write_NC(). 
+ */ status = write_NC(ncp); /* we should continue to exit define mode, even if header is inconsistent @@ -1525,6 +1391,24 @@ ncmpio__enddef(void *ncdp, fClr(ncp->ncp_sf->flags, NC_MODE_CREATE | NC_MODE_DEF); #endif + if (ncp->mpiinfo != MPI_INFO_NULL) { + /* reflect the hint changes to the MPI info object, so the user can + * inquire what the true hint values are being used + */ + sprintf(value, OFFFMT, ncp->v_align); + MPI_Info_set(ncp->mpiinfo, "nc_var_align_size", value); + sprintf(value, OFFFMT, ncp->r_align); + MPI_Info_set(ncp->mpiinfo, "nc_record_align_size", value); + +#ifdef ENABLE_SUBFILING + sprintf(value, "%d", ncp->num_subfiles); + MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", value); +#else + MPI_Info_set(ncp->mpiinfo, "pnetcdf_subfiling", "disable"); + MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", "0"); +#endif + } + return status; } diff --git a/src/drivers/ncmpio/ncmpio_file_io.c b/src/drivers/ncmpio/ncmpio_file_io.c index 681cfd599e..db040ec54a 100644 --- a/src/drivers/ncmpio/ncmpio_file_io.c +++ b/src/drivers/ncmpio/ncmpio_file_io.c @@ -17,313 +17,917 @@ #include #include "ncmpio_NC.h" -/*----< ncmpio_read_write() >------------------------------------------------*/ -int -ncmpio_read_write(NC *ncp, - int rw_flag, /* NC_REQ_WR or NC_REQ_RD */ - int coll_indep, /* NC_REQ_COLL or NC_REQ_INDEP */ - MPI_Offset offset, - MPI_Offset buf_count, - MPI_Datatype buf_type, - void *buf, - int buftype_is_contig) +/*----< get_count() >--------------------------------------------------------*/ +/* This subroutine is independent. On success, the number of bytes read/written + * is returned (zero indicates nothing was read/written). Like POSIX read()/ + * write(), it is not an error if this number is smaller than the number of + * bytes requested. On error, a negative value, an NC error code, is returned. 
+ */ +static +MPI_Offset get_count(MPI_Status *mpistatus, + MPI_Datatype datatype) { - char *mpi_name; - int status=NC_NOERR, err=NC_NOERR, mpireturn; - MPI_Status mpistatus; - MPI_File fh; - MPI_Offset req_size; + int mpireturn; + + if (datatype == MPI_DATATYPE_NULL) return 0; #ifdef HAVE_MPI_TYPE_SIZE_C - MPI_Count btype_size; + MPI_Count type_size; /* MPI_Type_size_c is introduced in MPI 4.0 */ - mpireturn = MPI_Type_size_c(buf_type, &btype_size); - mpi_name = "MPI_Type_size_c"; + MPI_Type_size_c(datatype, &type_size); #elif defined(HAVE_MPI_TYPE_SIZE_X) - MPI_Count btype_size; + MPI_Count type_size; /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(buf_type, &btype_size); - mpi_name = "MPI_Type_size_x"; + MPI_Type_size_x(datatype, &type_size); #else - int btype_size; - mpireturn = MPI_Type_size(buf_type, &btype_size); - mpi_name = "MPI_Type_size"; + int type_size; + MPI_Type_size(datatype, &type_size); #endif - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - /* return the first encountered error if there is any */ - err = (err == NC_EFILE) ? 
NC_EREAD : err; - } - else if (btype_size == MPI_UNDEFINED) { -#ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: btype_size MPI_UNDEFINED buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + +#ifdef HAVE_MPI_GET_COUNT_C + MPI_Count count; + mpireturn = MPI_Get_count_c(mpistatus, datatype, &count); +#else + int count; + mpireturn = MPI_Get_count(mpistatus, datatype, &count); #endif - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - } - if (err != NC_NOERR) { - if (coll_indep == NC_REQ_COLL) { - DEBUG_ASSIGN_ERROR(status, err) - /* write nothing, but participate the collective call */ - buf_count = 0; - } - else - DEBUG_RETURN_ERROR(err) - } + if (mpireturn != MPI_SUCCESS || count == MPI_UNDEFINED) + /* In case of partial read/write, MPI_Get_elements() is supposed to be + * called to obtain the number of type map elements actually + * read/written in order to calculate the true read/write amount. Below + * skips this step and simply returns the partial read/write amount. + * See an example usage of MPI_Get_count() in Example 5.12 from MPI + * standard document. + */ + return NC_EFILE; - /* request size in bytes, may be > NC_MAX_INT */ - req_size = buf_count * btype_size; + return (MPI_Offset)count * type_size; +} - /* explicitly initialize mpistatus object to 0. For zero-length read, +/*----< ncmpio_file_read_at() >----------------------------------------------*/ +/* + * This function is independent. + */ +/* TODO: move check count against MAX_INT and call _c API */ +MPI_Offset +ncmpio_file_read_at(NC *ncp, + MPI_Offset offset, + void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* explicitly initialize mpistatus object to 0. For zero-length read/write, * MPI_Get_count may report incorrect result for some MPICH version, * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. + * Thus we initialize it above to work around. 
See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 */ memset(&mpistatus, 0, sizeof(MPI_Status)); - if (coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - else - fh = ncp->independent_fh; + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; - if (rw_flag == NC_REQ_RD) { - void *xbuf=buf; - MPI_Datatype xbuf_type=buf_type; + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; + + if (fh == MPI_FILE_NULL) return 0; #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count xlen = (MPI_Count)buf_count; + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_read_at_c, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #else - int xlen = (int)buf_count; + int count = (buf_view.is_contig) ? buf_view.size : 1; - if (buf_count > NC_MAX_INT) { - if (coll_indep == NC_REQ_COLL) { + if (buf_view.size > NC_MAX_INT) { #ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - /* write nothing, but participate the collective call */ - xlen = 0; - } - else - DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) } + TRACE_IO(MPI_File_read_at, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD) + } + + /* update the number of bytes read since file open */ + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else if (ncp->pncio_fh != NULL) + amnt = PNCIO_File_read_at(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes read since file open */ + if (amnt >= 0) ncp->get_size += amnt; + /* else: ignore if error, as this error is 
not fatal */ + + return amnt; +} + +/*----< ncmpio_file_read_at_all() >------------------------------------------*/ +/* + * This function is collective. + */ +MPI_Offset +ncmpio_file_read_at_all(NC *ncp, + MPI_Offset offset, + void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* Explicitly initialize mpistatus object to 0. For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; + + if (fh == MPI_FILE_NULL) return 0; - if (xlen > 0 && !buftype_is_contig && req_size <= ncp->ibuf_size) { - /* if read buffer is noncontiguous and size is < ncp->ibuf_size, - * allocate a temporary buffer and use it to read, as some MPI, - * e.g. Cray on KNL, can be significantly slow when read buffer is - * noncontiguous. - */ #ifdef HAVE_MPI_LARGE_COUNT - xbuf_type = MPI_BYTE; - xlen = (MPI_Count)req_size; + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_read_at_all_c, (fh, offset, buf, count, + buf_view.type, &mpistatus)); #else - if (req_size > NC_MAX_INT) { - mpireturn = MPI_Type_contiguous(xlen, buf_type, &xbuf_type); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_contiguous"); - if (coll_indep == NC_REQ_COLL) - DEBUG_ASSIGN_ERROR(status, err) - else - DEBUG_RETURN_ERROR(err) - } - MPI_Type_commit(&xbuf_type); - xlen = 1; - } - else { - xbuf_type = MPI_BYTE; - xlen = (int)req_size; - } + int count = (buf_view.is_contig) ? 
buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - xbuf = NCI_Malloc((size_t)req_size); + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + /* participate the collective call, but read nothing */ + count = 0; + } + TRACE_IO(MPI_File_read_at_all, (fh, offset, buf, count, + buf_view.type, &mpistatus)); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD) } - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) { + /* update the number of bytes read since file open */ + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else if (ncp->pncio_fh != NULL) + amnt = PNCIO_File_read_at_all(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes read since file open */ + if (amnt >= 0) ncp->get_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_file_write_at() >---------------------------------------------*/ +/* + * This function is independent. + */ +MPI_Offset +ncmpio_file_write_at(NC *ncp, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* Explicitly initialize mpistatus object to 0. For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? 
ncp->independent_fh : ncp->collective_fh; + + if (fh == MPI_FILE_NULL) return 0; + #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_read_at_all_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_write_at_c, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #else - TRACE_IO(MPI_File_read_at_all, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + int count = (buf_view.is_contig) ? buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - } else { + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + } + TRACE_IO(MPI_File_write_at, (fh, offset, buf, count, buf_view.type, + &mpistatus)); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EWRITE) + } + + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else if (ncp->pncio_fh != NULL) + amnt = PNCIO_File_write_at(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes written since file open */ + if (amnt >= 0) ncp->put_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_file_write_at_all() >-----------------------------------------*/ +/* + * This function is collective. + */ +MPI_Offset +ncmpio_file_write_at_all(NC *ncp, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* explicitly initialize mpistatus object to 0. For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. 
See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; + + if (fh == MPI_FILE_NULL) return 0; + #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_read_at_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_write_at_all_c, (fh, offset, buf, count, + buf_view.type, &mpistatus)); #else - TRACE_IO(MPI_File_read_at, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + int count = (buf_view.is_contig) ? buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + /* participate the collective call, but write nothing */ + count = 0; } + TRACE_IO(MPI_File_write_at_all, (fh, offset, buf, count, + buf_view.type, &mpistatus)); +#endif if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? 
NC_EREAD : err; - DEBUG_ASSIGN_ERROR(status, err) - } + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EWRITE) + } + + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else if (ncp->pncio_fh != NULL) + amnt = PNCIO_File_write_at_all(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes written since file open */ + if (amnt >= 0) ncp->put_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_getput_zero_req() >-------------------------------------------*/ +/* This function is called when this process has zero-length I/O request and + * must participate all the MPI collective calls involved in the collective + * APIs and wait_all(), which include setting fileview, collective read/write, + * another setting fileview. + * + * This function is collective. + */ +int +ncmpio_getput_zero_req(NC *ncp, int reqMode) +{ + int err, status=NC_NOERR; + MPI_Offset rlen, wlen; + PNCIO_View buf_view; + + buf_view.size = 0; + + /* When intra-node aggregation is enabled, non-aggregators do not access + * the file. + */ + if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr) + return NC_NOERR; + + /* do nothing if this came from an independent API */ + if (fIsSet(reqMode, NC_REQ_INDEP)) return NC_NOERR; + + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + if (status == NC_NOERR) status = err; + + if (fIsSet(reqMode, NC_REQ_RD)) { + if (ncp->nprocs > 1) + rlen = ncmpio_file_read_at_all(ncp, 0, NULL, buf_view); + else + rlen = ncmpio_file_read_at(ncp, 0, NULL, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; + } + else { /* write request */ + if (ncp->nprocs > 1) + wlen = ncmpio_file_write_at_all(ncp, 0, NULL, buf_view); + else + wlen = ncmpio_file_write_at(ncp, 0, NULL, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + } + + /* Reset fileview. 
Note fileview is never reused in PnetCDF */ + ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + + /* No longer need to reset the file view, as the root's fileview includes + * the whole file header. + */ + + return status; +} + +/*----< ncmpio_read_write() >------------------------------------------------*/ +int +ncmpio_read_write(NC *ncp, + int rw_flag, /* NC_REQ_WR or NC_REQ_RD */ + MPI_Offset offset, + PNCIO_View buf_view, + void *buf) +{ + char *mpi_name; + int i, status=NC_NOERR, err=NC_NOERR, mpireturn, coll_indep; + int to_free_buftype=0; + MPI_Offset rlen, wlen; + + coll_indep = NC_REQ_INDEP; + if (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP)) + coll_indep = NC_REQ_COLL; + + /* for zero-sized request */ + if (buf_view.size == 0) { + if (coll_indep == NC_REQ_INDEP) + return NC_NOERR; + + if (rw_flag == NC_REQ_RD) { + rlen = ncmpio_file_read_at_all(ncp, 0, NULL, buf_view); + if (rlen < 0) status = (int)rlen; } else { - /* update the number of bytes read since file open */ -#ifdef HAVE_MPI_GET_COUNT_C - MPI_Count get_size; - MPI_Get_count_c(&mpistatus, MPI_BYTE, &get_size); - ncp->get_size += get_size; -#else - int get_size; - mpireturn = MPI_Get_count(&mpistatus, xbuf_type, &get_size); - if (mpireturn != MPI_SUCCESS || get_size == MPI_UNDEFINED) - ncp->get_size += req_size; - else { -#ifdef HAVE_MPI_TYPE_SIZE_X - /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(xbuf_type, &btype_size); -#else - mpireturn = MPI_Type_size(xbuf_type, &btype_size); + wlen = ncmpio_file_write_at_all(ncp, 0, NULL, buf_view); + if (wlen < 0) status = (int)wlen; + } + goto fn_exit; + } + + /* buf_view.count is the number of offset-length pairs */ + + /* buf_view.size is in bytes, may be > NC_MAX_INT */ + + if (rw_flag == NC_REQ_RD) { + void *xbuf=buf; + +#ifndef HAVE_MPI_LARGE_COUNT + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, 
__func__,__LINE__,buf_view.size); #endif - if (mpireturn != MPI_SUCCESS || get_size == MPI_UNDEFINED) - ncp->get_size += req_size; - else - ncp->get_size += btype_size * get_size; + if (coll_indep == NC_REQ_COLL) { + DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) + /* write nothing, but participate the collective call */ + buf_view.size = 0; } + else + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + } #endif + +// printf("%s at %d: buf_view count=%lld type=%s size=%lld\n",__func__,__LINE__, buf_view.count, (buf_view.type==MPI_BYTE)?"MPI_BYTE":"NOT MPI_BYTE", buf_view.size); + + if (!buf_view.is_contig && buf_view.size <= ncp->ibuf_size) { + /* The only case of read buffer being noncontiguous is when + * nonblocking API ncmpi_wait/wait_all() is called and INA is + * disabled. If read buffer is noncontiguous and size is < + * ncp->ibuf_size, we allocate a temporary contiguous buffer and + * use it to read. Later it is unpacked to user buffer. As some + * MPI, e.g. Cray on KNL, can be significantly slow when write + * buffer is noncontiguous. + * + * Note ncp->ibuf_size is never > NC_MAX_INT. 
+ */ + xbuf = NCI_Malloc(buf_view.size); + buf_view.type = MPI_BYTE; + buf_view.is_contig = 1; } - if (xbuf != buf) { /* unpack contiguous xbuf to noncontiguous buf */ + + if (!buf_view.is_contig && ncp->fstype == PNCIO_FSTYPE_MPIIO) { + /* construct a buftype */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count pos=0; - mpireturn = MPI_Unpack_c(xbuf, xlen, &pos, buf, (MPI_Count)buf_count, - buf_type, MPI_COMM_SELF); - mpi_name = "MPI_Unpack_c"; + /* TODO: MPI_Type_create_hindexed_c + * buf_view.count should be of type MPI_Count + * buf_view.len should be of type MPI_Count + * buf_view.off should be of type MPI_Count + */ + mpireturn = MPI_Type_create_hindexed_c(buf_view.count, + buf_view.len, + buf_view.off, + MPI_BYTE, &buf_view.type); + mpi_name = "MPI_Type_create_hindexed_c"; +#else + MPI_Aint *disp; +#if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view.off; #else - int pos=0; - mpireturn = MPI_Unpack(xbuf, xlen, &pos, buf, (int)buf_count, - buf_type, MPI_COMM_SELF); - mpi_name = "MPI_Unpack"; + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * buf_view.count); + for (j=0; jnprocs > 1 && coll_indep == NC_REQ_COLL) + rlen = ncmpio_file_read_at_all(ncp, offset, xbuf, buf_view); + else + rlen = ncmpio_file_read_at(ncp, offset, xbuf, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; + + if (xbuf != buf) { /* unpack contiguous xbuf to noncontiguous buf */ + char *in_ptr, *out_ptr; + in_ptr = xbuf; + +#if 0 + long long *wkl, nelems; int j; + wkl = (long long*) malloc(buf_view.size); + nelems=buf_view.size/8; + memcpy(wkl, xbuf, nelems*8); ncmpii_in_swapn(wkl, nelems, 8); + printf("%s at %d: nelems=%lld xbuf=(%p) ",__func__,__LINE__, nelems, xbuf); + for (i=0; i NC_MAX_INT) { - if (coll_indep == NC_REQ_COLL) { -#ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + if (!buf_view.is_contig && buf_view.size <= ncp->ibuf_size) { + /* The only case of write buffer 
being noncontiguous is when + * nonblocking API ncmpi_wait/wait_all() is called and INA is + * disabled. If write buffer is noncontiguous and size is < + * ncp->ibuf_size, pack it a temporary contiguous buffer and use it + * to write. As some MPI, e.g. Cray on KNL, can be significantly + * slow when write buffer is noncontiguous. + * + * Note ncp->ibuf_size is never > NC_MAX_INT. + */ + char *in_ptr, *out_ptr; + xbuf = NCI_Malloc(buf_view.size); + out_ptr = xbuf; +assert(buf != NULL); +// printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size); + +#if 0 +printf("%s at %d: buf = %p\n",__func__,__LINE__, buf); +printf("%s at %d: buf_view count=%lld off=%lld %lld len=%lld %lld\n",__func__,__LINE__, buf_view.count,buf_view.off[0],buf_view.off[1],buf_view.len[0],buf_view.len[1]); +int wkl[21]; #endif - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - /* write nothing, but participate the collective call */ - xlen = 0; + for (i=0; i 0 && !buftype_is_contig && req_size <= ncp->ibuf_size) { - /* if write buffer is noncontiguous and size is < ncp->ibuf_size, - * allocate a temporary buffer and use it to write, as some MPI, - * e.g. Cray on KNL, can be significantly slow when write buffer is - * noncontiguous. 
- */ + if (!buf_view.is_contig && ncp->fstype == PNCIO_FSTYPE_MPIIO) { + /* construct a buftype */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count pos=0; - xbuf_type = MPI_BYTE; - xlen = (MPI_Count)req_size; - xbuf = NCI_Malloc(req_size); - mpireturn = MPI_Pack_c(buf, (MPI_Count)buf_count, buf_type, xbuf, - (MPI_Count)req_size, &pos, MPI_COMM_SELF); - mpi_name = "MPI_Pack_c"; + /* TODO: MPI_Type_create_hindexed_c + * buf_view.count should be of type MPI_Count + * buf_view.len should be of type MPI_Count + * buf_view.off should be of type MPI_Count + */ + mpireturn = MPI_Type_create_hindexed_c(buf_view.count, + buf_view.len, + buf_view.off, + MPI_BYTE, &buf_view.type); + mpi_name = "MPI_Type_create_hindexed_c"; #else - if (req_size > NC_MAX_INT) { - /* skip packing write data into a temp buffer */ - xlen = (int)buf_count; - xbuf_type = buf_type; - mpireturn = MPI_SUCCESS; - } - else { - int pos=0; - xbuf_type = MPI_BYTE; - xlen = (int)req_size; - xbuf = NCI_Malloc(xlen); - mpireturn = MPI_Pack(buf, (int)buf_count, buf_type, xbuf, - xlen, &pos, MPI_COMM_SELF); - mpi_name = "MPI_Pack"; - } + MPI_Aint *disp; +#if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view.off; +#else + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * buf_view.count); + for (j=0; jnprocs > 1 && coll_indep == NC_REQ_COLL) { + if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) + wlen = ncmpio_file_write_at_all(ncp, offset, xbuf, buf_view); + else + wlen = ncmpio_file_write_at(ncp, offset, xbuf, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + + if (xbuf != buf) NCI_Free(xbuf); + if (to_free_buftype) + MPI_Type_free(&buf_view.type); + } + +fn_exit: + /* Reset fileview. Note fileview is never reused in PnetCDF */ + ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + + return status; +} + +/*----< ncmpio_file_close() >------------------------------------------------*/ +/* + * This function is collective. 
+ */ +int +ncmpio_file_close(NC *ncp) +{ + int err=NC_NOERR; + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + + if (ncp->independent_fh != ncp->collective_fh && + ncp->independent_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_close, (&ncp->independent_fh)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + + if (ncp->collective_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_close, (&ncp->collective_fh)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + } + else { + /* When intra-node aggregation is enabled, only aggregators have a + * non-NULL ncp->pncio_fh and non-aggregators has pncio_fh == NULL. + */ + if (ncp->pncio_fh != NULL) { + err = PNCIO_File_close(ncp->pncio_fh); + NCI_Free(ncp->pncio_fh); + ncp->pncio_fh = NULL; + } + } + + return err; +} + +/*----< ncmpio_file_delete() >-----------------------------------------------*/ +/* + * This function is collective. + * + * This subroutine is called only from ncmpi_abort. When the file is being + * created and an error occurs, the program is still in define mode. In this + * case, the file is deleted. + */ +int +ncmpio_file_delete(NC *ncp) +{ + int err=NC_NOERR; + + if (ncp->rank == 0) { + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + TRACE_IO(MPI_File_delete, ((char *)ncp->path, ncp->mpiinfo)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + else + err = PNCIO_File_delete(ncp->path); + } + + if (ncp->nprocs > 1) + MPI_Bcast(&err, 1, MPI_INT, 0, ncp->comm); + + return err; +} + +/*----< ncmpio_file_sync() >-------------------------------------------------*/ +/* This function must be called collectively, no matter if it is in collective + * or independent data mode. 
+ */ +int +ncmpio_file_sync(NC *ncp) { + char *mpi_name; + int mpireturn; + + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + if (ncp->pncio_fh == NULL) + return NC_NOERR; + return PNCIO_File_sync(ncp->pncio_fh); + } + + /* the remaining of this subroutine are for when using MPI-IO */ + + if (ncp->independent_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_sync, (ncp->independent_fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + /* when nprocs == 1, ncp->collective_fh == ncp->independent_fh */ + if (ncp->nprocs == 1) return NC_NOERR; + + /* When intra-node aggregation is enabled, non-aggregator's + * ncp->collective_fh is always MPI_FILE_NULL. When disabled, + * ncp->collective_fh on all ranks is never MPI_FILE_NULL as collective + * mode is default in PnetCDF. + */ + if (ncp->collective_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_sync, (ncp->collective_fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + + /* Barrier is not necessary ... + TRACE_COMM(MPI_Barrier)(ncp->comm); + */ + + return NC_NOERR; +} + +/*----< ncmpio_file_set_view() >---------------------------------------------*/ +/* This subroutine is collective when using MPI-IO. When using internal PNCIO + * driver, this subroutine is independent. 
+ */ +int +ncmpio_file_set_view(const NC *ncp, + MPI_Offset disp, /* IN/OUT */ + MPI_Datatype filetype, + MPI_Aint npairs, #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_all_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count *offsets, + MPI_Count *lengths #else - TRACE_IO(MPI_File_write_at_all, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Offset *offsets, + int *lengths #endif - } else { +) +{ + char *mpi_name; + int err, mpireturn, status=NC_NOERR; + MPI_File fh; + +assert(filetype == MPI_BYTE); +assert(disp == 0); + + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + /* Skip setting fileview for ranks whose pncio_fh is NULL */ + if (ncp->pncio_fh == NULL) + return NC_NOERR; + + /* When PnetCDF's internal PNCIO driver is used, the request has been + * flattened into offsets and lengths. Thus passed-in filetype is not + * constructed. Note offsets and lengths are not relative to any MPI-IO + * fileview. They will be reused in PNCIO driver as a flattened file + * type struct, which avoids repeated work of constructing and + * flattening the filetype. + */ + return PNCIO_File_set_view(ncp->pncio_fh, disp, filetype, npairs, + offsets, lengths); + } + + /* Now, ncp->fstype == PNCIO_FSTYPE_MPIIO, i.e. using MPI-IO. */ + int to_free_filetype=0; + + /* when ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh */ + fh = (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP)) + ? 
ncp->collective_fh : ncp->independent_fh; + + if (fh == MPI_FILE_NULL) /* not INA aggregator */ + return NC_NOERR; + + if (npairs == 0) /* zero-sized requests */ + filetype = MPI_BYTE; + else { #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + /* construct fileview */ + mpireturn = MPI_Type_create_hindexed_c(npairs, lengths, offsets, + MPI_BYTE, &filetype); #else - TRACE_IO(MPI_File_write_at, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + assert(sizeof(*offsets) == sizeof(MPI_Aint)); + /* construct fileview */ + mpireturn = MPI_Type_create_hindexed(npairs, lengths, + (MPI_Aint*)offsets, + MPI_BYTE, &filetype); #endif - } if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); /* return the first encountered error if there is any */ - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } + if (status == NC_NOERR) status = err; } else { - /* update the number of bytes written since file open */ -#ifdef HAVE_MPI_GET_COUNT_C - MPI_Count put_size; - MPI_Get_count_c(&mpistatus, MPI_BYTE, &put_size); - ncp->put_size += put_size; -#else - int put_size; - mpireturn = MPI_Get_count(&mpistatus, xbuf_type, &put_size); - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += req_size; - else { -#ifdef HAVE_MPI_TYPE_SIZE_X - /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(xbuf_type, &btype_size); -#else - mpireturn = MPI_Type_size(xbuf_type, &btype_size); -#endif - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += req_size; - else - ncp->put_size += btype_size * put_size; + mpireturn = MPI_Type_commit(&filetype); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) 
status = err; } -#endif + else + to_free_filetype = 1; } - if (xbuf != buf) NCI_Free(xbuf); - if (xbuf_type != buf_type && xbuf_type != MPI_BYTE) - MPI_Type_free(&xbuf_type); } + TRACE_IO(MPI_File_set_view, (fh, disp, MPI_BYTE, filetype, "native", + MPI_INFO_NULL)); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (status == NC_NOERR) status = err; +assert(0); + } + + if (to_free_filetype) + MPI_Type_free(&filetype); + return status; } +/*----< ncmpio_file_open() >-------------------------------------------------*/ +int +ncmpio_file_open(NC *ncp, + MPI_Comm comm, + const char *path, + int omode, + MPI_Info info) +{ + int err=NC_NOERR; + + /* open file collectively */ + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + MPI_File fh; + + TRACE_IO(MPI_File_open, (comm, path, omode, info, &fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + + /* Now the file has been successfully opened */ + ncp->collective_fh = fh; + ncp->independent_fh = (ncp->nprocs > 1) ? MPI_FILE_NULL : fh; + + /* get the I/O hints used/modified by MPI-IO */ + TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + else { /* ncp->fstype != PNCIO_FSTYPE_MPIIO */ + ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File)); + + err = PNCIO_File_open(comm, path, omode, info, ncp->pncio_fh); + if (err != NC_NOERR) return err; + + /* Now the file has been successfully opened, obtain the I/O hints + * used/modified by PNCIO driver. 
+ */ + err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo); + } + + return err; +} + diff --git a/src/drivers/ncmpio/ncmpio_file_misc.c b/src/drivers/ncmpio/ncmpio_file_misc.c index 932b5027f7..76292c0881 100644 --- a/src/drivers/ncmpio/ncmpio_file_misc.c +++ b/src/drivers/ncmpio/ncmpio_file_misc.c @@ -81,8 +81,7 @@ dup_NC(const NC *ref) int ncmpio_redef(void *ncdp) { - char *mpi_name; - int err, status=NC_NOERR, mpireturn; + int err, status=NC_NOERR; NC *ncp = (NC*)ncdp; #if 0 @@ -100,7 +99,7 @@ ncmpio_redef(void *ncdp) if (NC_indep(ncp)) /* exit independent mode, if in independent mode */ ncmpio_end_indep_data(ncp); - /* duplicate a header to be used in enddef() for checking if header grows */ + /* duplicate header to be used in enddef() for checking if header grows */ ncp->old = dup_NC(ncp); if (ncp->old == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) @@ -108,21 +107,8 @@ ncmpio_redef(void *ncdp) fSet(ncp->flags, NC_MODE_DEF); /* must reset fileview as header extent may later change in enddef() */ - TRACE_IO(MPI_File_set_view, (ncp->collective_fh, 0, MPI_BYTE, - MPI_BYTE, "native", MPI_INFO_NULL)); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - DEBUG_ASSIGN_ERROR(status, err) - } - - if (ncp->independent_fh != MPI_FILE_NULL) { - TRACE_IO(MPI_File_set_view, (ncp->independent_fh, 0, MPI_BYTE, - MPI_BYTE, "native", MPI_INFO_NULL)); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - DEBUG_ASSIGN_ERROR(status, err) - } - } + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + DEBUG_ASSIGN_ERROR(status, err) return status; } @@ -132,7 +118,6 @@ ncmpio_redef(void *ncdp) int ncmpio_begin_indep_data(void *ncdp) { - char *mpi_name; NC *ncp = (NC*)ncdp; if (NC_indef(ncp)) /* must not be in define mode */ @@ -151,6 +136,73 @@ ncmpio_begin_indep_data(void *ncdp) /* raise independent flag */ fSet(ncp->flags, NC_MODE_INDEP); + /* Barrier is necessary to prevent non-aggregators from calling open() + 
* before the file is being collectively created by the aggregators. + */ + MPI_Barrier(ncp->comm); + + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + /* When using PnetCDF's PNCIO driver, there are 2 scenarios: + * 1. When intra-node aggregation (INA) is enabled, at the end of + * ncmpi_create/ncmpi_open, non-aggregators' pncio_fh are NULL. Thus + * switching to independent data mode, we can re-use pncio_fh to + * store file handler of file opened with MPI_COMM_SELF. Note + * whether pncio_fh is NULL or not does not tell whether INA is + * enabled or not. + * 2. When INA is disabled, all ranks calls PNCIO_File_open() and thus + * pncio_fh should not be NULL. In other word, this scenario should + * not reach here at all. Because PnetCDF's PNCIO driver relaxes + * File_setview subroutine to be able to called independently, the + * same pncio_fh can be used for both collective and independent I/O + * APIs. Note we cannot re-used pncio_fh for the above scenario 1, + * because in the collective data mode, all ranks must participate + * each collective I/O call, + */ + int err; + char *filename; + + if (ncp->pncio_fh != NULL) + /* Only INA non-aggregators' pncio_fh can be NULL, because + * aggregators open the file collectively and their pncio_fh can + * never be NULL. + */ + return NC_NOERR; + + filename = ncmpii_remove_file_system_type_prefix(ncp->path); + + ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File)); + ncp->pncio_fh->file_system = ncp->fstype; + ncp->pncio_fh->node_ids.num_nodes = 1; + ncp->pncio_fh->node_ids.ids = (int*) NCI_Malloc(sizeof(int)); + ncp->pncio_fh->node_ids.ids[0] = 0; + + int omode = fClr(ncp->mpiomode, MPI_MODE_CREATE); + + err = PNCIO_File_open(MPI_COMM_SELF, filename, omode, ncp->mpiinfo, + ncp->pncio_fh); + if (err != NC_NOERR) + return err; + + /* Get the I/O hints used/modified by MPI-IO. Note ncp->mpiinfo may + * have been populated. It can be discarded and replaced by the one + * used by MPI-IO. 
+ */ + if (ncp->mpiinfo != MPI_INFO_NULL) + MPI_Info_free(&ncp->mpiinfo); + + err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo); + if (err != NC_NOERR) return err; + + /* Add PnetCDF hints into ncp->mpiinfo */ + ncmpio_hint_set(ncp, ncp->mpiinfo); + + NCI_Free(ncp->pncio_fh->node_ids.ids); + ncp->pncio_fh->node_ids.num_nodes = 0; + ncp->pncio_fh->node_ids.ids = NULL; + + return NC_NOERR; + } + /* PnetCDF's default mode is collective. MPI file handle, collective_fh, * will never be MPI_FILE_NULL. We must use a separate MPI file handle * opened with MPI_COMM_SELF, because MPI_File_set_view is a collective @@ -159,12 +211,23 @@ ncmpio_begin_indep_data(void *ncdp) * called. */ if (ncp->independent_fh == MPI_FILE_NULL) { + char *mpi_name; int mpireturn; - TRACE_IO(MPI_File_open, (MPI_COMM_SELF, ncp->path, - ncp->mpiomode, ncp->mpiinfo, - &ncp->independent_fh)); + TRACE_IO(MPI_File_open, (MPI_COMM_SELF, ncp->path, ncp->mpiomode, + ncp->mpiinfo, &ncp->independent_fh)); if (mpireturn != MPI_SUCCESS) return ncmpii_error_mpi2nc(mpireturn, mpi_name); + + /* for those ranks whose mpiinfo is NULL, retrieve info */ + if (ncp->mpiinfo == MPI_INFO_NULL) { + /* get the I/O hints used/modified by MPI-IO */ + mpireturn = MPI_File_get_info(ncp->independent_fh, &ncp->mpiinfo); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + + /* Copy MPI-IO hints into ncp->mpiinfo */ + ncmpio_hint_set(ncp, ncp->mpiinfo); + } } return NC_NOERR; } @@ -242,9 +305,14 @@ ncmpio_abort(void *ncdp) } /* close the file */ - err = ncmpio_close_files(ncp, doUnlink); + err = ncmpio_file_close(ncp); if (status == NC_NOERR ) status = err; + if (doUnlink) { + err = ncmpio_file_delete(ncp); + status = (status == NC_NOERR) ? 
err : status; + } + /* free up space occupied by the header metadata */ ncmpio_free_NC(ncp); @@ -365,7 +433,17 @@ ncmpio_inq_misc(void *ncdp, if (recsize != NULL) *recsize = ncp->recsize; - if (header_size != NULL) *header_size = ncp->xsz; + if (header_size != NULL) { + if (NC_indef(ncp)) + /* When called in define mode, calculate and return the current + * header size. Cannot do the same for header extent, as the empty + * space depends on arguments h_minfree and v_align of + * ncmpi__enddef(). + */ + *header_size = ncmpio_hdr_len_NC(ncp); + else + *header_size = ncp->xsz; + } if (header_extent != NULL) *header_extent = ncp->begin_var; @@ -388,9 +466,12 @@ ncmpio_inq_misc(void *ncdp, sprintf(value, OFFFMT, ncp->r_align); MPI_Info_set(*info_used, "nc_record_align_size", value); - sprintf(value, "%d", ncp->chunk); + sprintf(value, "%d", ncp->hdr_chunk); MPI_Info_set(*info_used, "nc_header_read_chunk_size", value); + sprintf(value, "%d", ncp->data_chunk); + MPI_Info_set(*info_used, "nc_data_move_chunk_size", value); + if (fIsSet(ncp->flags, NC_MODE_SWAP_ON)) MPI_Info_set(*info_used, "nc_in_place_swap", "enable"); else if (fIsSet(ncp->flags, NC_MODE_SWAP_OFF)) @@ -444,12 +525,23 @@ int ncmpi_delete(const char *filename, MPI_Info info) { + int err = NC_NOERR; +#ifdef MIMIC_LUSTRE + char *path = ncmpii_remove_file_system_type_prefix(filename); + err = unlink(path); + if (err != 0) + err = ncmpii_error_posix2nc("unlink"); +#else + err = PNCIO_File_delete(filename); +#if 0 char *mpi_name; - int err=NC_NOERR, mpireturn; + int mpireturn; - TRACE_IO(MPI_File_delete, ((char*)filename, info)); + TRACE_IO(MPI_File_delete, (filename, info)); if (mpireturn != MPI_SUCCESS) err = ncmpii_error_mpi2nc(mpireturn, mpi_name); +#endif +#endif return err; } diff --git a/src/drivers/ncmpio/ncmpio_filetype.c b/src/drivers/ncmpio/ncmpio_filetype.c index 828ab41325..3d84d54079 100644 --- a/src/drivers/ncmpio/ncmpio_filetype.c +++ b/src/drivers/ncmpio/ncmpio_filetype.c @@ -506,6 +506,9 @@ 
ncmpio_filetype_create_vars(const NC *ncp, MPI_Offset i, nblocks, nelems, *blocklens; MPI_Datatype filetype=MPI_BYTE; +/* This is no longer used, as all requests go to INA subroutines to flatten. */ +assert(0); + if (stride == NULL) return filetype_create_vara(ncp, varp, start, count, offset_ptr, filetype_ptr, is_filetype_contig); @@ -606,105 +609,3 @@ ncmpio_filetype_create_vars(const NC *ncp, return err; } -/*----< ncmpio_file_set_view() >---------------------------------------------*/ -/* This function handles the special case for root process for setting its - * file view: to keeps the whole file header visible to the root process. This - * is because the root process may update the number of records or attributes - * into the file header while in data mode. In PnetCDF design, only root - * process can read/write the file header. - * This function is collective if called in collective data mode - */ -int -ncmpio_file_set_view(const NC *ncp, - MPI_File fh, - MPI_Offset *offset, /* IN/OUT */ - MPI_Datatype filetype) -{ - char *mpi_name; - int err, mpireturn, status=NC_NOERR; - - if (filetype == MPI_BYTE) { - /* filetype is a contiguous space, make the whole file visible */ - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, - "native", MPI_INFO_NULL)); - return NC_NOERR; - } - - if (ncp->rank == 0) { - /* prepend the whole file header to filetype */ - MPI_Datatype root_filetype=MPI_BYTE, ftypes[2]; -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count blocklens[2]; - MPI_Count disps[2]; - blocklens[0] = ncp->begin_var; -#else - int blocklens[2]; - MPI_Aint disps[2]; - - /* check if header size > 2^31 */ - if (ncp->begin_var > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW); - goto err_out; - } - - blocklens[0] = (int)ncp->begin_var; -#endif - - /* first block is the header extent */ - disps[0] = 0; - ftypes[0] = MPI_BYTE; - - /* second block is filetype, the subarray request(s) to the variable */ - blocklens[1] = 1; - disps[1] = *offset; - ftypes[1] = 
filetype; - -#if !defined(HAVE_MPI_LARGE_COUNT) && (SIZEOF_MPI_AINT != SIZEOF_MPI_OFFSET) - if (*offset > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW); - goto err_out; - } -#endif - -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_struct_c(2, blocklens, disps, ftypes, - &root_filetype); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct_c"); - if (status == NC_NOERR) status = err; - } -#else - mpireturn = MPI_Type_create_struct(2, blocklens, disps, ftypes, - &root_filetype); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct"); - if (status == NC_NOERR) status = err; - } -#endif - MPI_Type_commit(&root_filetype); - -#ifndef HAVE_MPI_LARGE_COUNT -err_out: -#endif - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, root_filetype, "native", - MPI_INFO_NULL)); - if (root_filetype != MPI_BYTE) - MPI_Type_free(&root_filetype); - - /* now update the explicit offset to be used in MPI-IO call later */ - *offset = ncp->begin_var; - } - else { - TRACE_IO(MPI_File_set_view, (fh, *offset, MPI_BYTE, filetype, "native", - MPI_INFO_NULL)); - /* the explicit offset is already set in fileview */ - *offset = 0; - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) status = err; - } - - return status; -} - diff --git a/src/drivers/ncmpio/ncmpio_fill.c b/src/drivers/ncmpio/ncmpio_fill.c index e392a366d9..2ddbd63247 100644 --- a/src/drivers/ncmpio/ncmpio_fill.c +++ b/src/drivers/ncmpio/ncmpio_fill.c @@ -144,13 +144,33 @@ fill_var_rec(NC *ncp, NC_var *varp, MPI_Offset recno) /* record number */ { - char *mpi_name; int err, status=NC_NOERR, mpireturn; void *buf; - MPI_Offset var_len, start, count, offset; - MPI_File fh; - MPI_Status mpistatus; - MPI_Datatype bufType; + MPI_Offset var_len, start, count, offset, wlen; + PNCIO_View buf_view; + + buf_view.type = MPI_BYTE; + buf_view.count = 0; + buf_view.is_contig = 1; + 
buf_view.size = 0; + buf_view.off = NULL; + buf_view.len = NULL; + + /* When intra-node aggregation is enabled, use the communicator consisting + * of aggregators in comm, nprocs, and rank. Non-aggregators do not + * participate the fill operation. + */ + MPI_Comm comm = ncp->comm; + int nprocs = ncp->nprocs; + int rank = ncp->rank; + if (ncp->num_aggrs_per_node > 0) { + if (ncp->my_aggr != ncp->rank) + return NC_NOERR; + + comm = ncp->ina_comm; + nprocs = ncp->ina_nprocs; + rank = ncp->ina_rank; + } if (varp->ndims == 0) /* scalar variable */ var_len = 1; @@ -162,14 +182,14 @@ fill_var_rec(NC *ncp, var_len = varp->dsizes[0]; /* divide total number of elements of this variable among all processes */ - count = var_len / ncp->nprocs; - start = count * ncp->rank; - if (ncp->rank < var_len % ncp->nprocs) { - start += ncp->rank; + count = var_len / nprocs; + start = count * rank; + if (rank < var_len % nprocs) { + start += rank; count++; } else { - start += var_len % ncp->nprocs; + start += var_len % nprocs; } /* allocate buffer space */ @@ -179,64 +199,45 @@ fill_var_rec(NC *ncp, err = fill_var_buf(varp, count, buf); if (err != NC_NOERR) { NCI_Free(buf); - count = 0; /* still participate collective calls below */ + /* still participate collective calls below */ + buf_view.size = 0; status = err; } + /* make the entire file visible */ + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + status = (status == NC_NOERR) ? 
err : status; + /* calculate the starting file offset for each process */ offset = varp->begin; if (IS_RECVAR(varp)) offset += ncp->recsize * recno; offset += start * varp->xsz; - /* when ncp->nprocs == 1, we keep I/O mode in independent mode at all time */ - fh = ncp->collective_fh; - - /* make the entire file visible */ - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) status = err; - } - count *= varp->xsz; - bufType = MPI_BYTE; - #ifndef HAVE_MPI_LARGE_COUNT if (count > NC_MAX_INT) { DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) if (status == NC_NOERR) status = err; - count = 0; /* participate collective write with 0-length request */ + /* participate collective write with 0-length request */ + buf_view.size = 0; } #endif + if (status == NC_NOERR) + buf_view.size = count; + +// if (ncp->rank ==0) printf("%s at %d: buf_view count=%lld size=%lld offset=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size,offset); + /* write to variable collectively */ - if (ncp->nprocs > 1) { -#ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_all_c, (fh, offset, buf, (MPI_Count)count, - bufType, &mpistatus)); -#else - TRACE_IO(MPI_File_write_at_all, (fh, offset, buf, (int)count, - bufType, &mpistatus)); -#endif - } - else { -#ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_c, (fh, offset, buf, (MPI_Count)count, - bufType, &mpistatus)); -#else - TRACE_IO(MPI_File_write_at, (fh, offset, buf, (int)count, - bufType, &mpistatus)); -#endif - } + if (nprocs > 1) + wlen = ncmpio_file_write_at_all(ncp, offset, buf, buf_view); + else + wlen = ncmpio_file_write_at(ncp, offset, buf, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + NCI_Free(buf); - if (bufType != MPI_BYTE) MPI_Type_free(&bufType); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) status = 
err; - } if (status != NC_NOERR) return status; @@ -248,9 +249,9 @@ fill_var_rec(NC *ncp, * First, find the max numrecs among all processes. */ MPI_Offset max_numrecs=recno+1; - if (ncp->nprocs > 1) { + if (nprocs > 1) { TRACE_COMM(MPI_Allreduce)(MPI_IN_PLACE, &max_numrecs, 1, MPI_OFFSET, - MPI_MAX, ncp->comm); + MPI_MAX, comm); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn, "MPI_Allreduce"); if (status == NC_NOERR) status = err; @@ -363,24 +364,36 @@ fill_added_recs(NC *ncp, NC *old_ncp) static int fillerup_aggregate(NC *ncp, NC *old_ncp) { - int i, j, k, mpireturn, err, status=NC_NOERR; + int i, j, k, err, status=NC_NOERR; int start_vid, recno, nVarsFill; - char *buf_ptr, *noFill, *mpi_name; + char *buf_ptr, *noFill; void *buf; size_t nsegs; - MPI_Offset buf_len, var_len, nrecs, start, *count; - MPI_Datatype filetype, bufType; - MPI_File fh; - MPI_Status mpistatus; + MPI_Offset buf_len, var_len, nrecs, start, *count, wlen; NC_var *varp; + PNCIO_View buf_view; #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklengths, *offset; + MPI_Count *blocklengths=NULL, *offset=NULL; #else - int *blocklengths; - MPI_Aint *offset; + int *blocklengths=NULL; + MPI_Offset *offset=NULL; #endif + /* When intra-node aggregation is enabled, use the communicator consisting + * of aggregators in comm, nprocs, and rank. Non-aggregators do not + * participate the fill operation. + */ + int nprocs = ncp->nprocs; + int rank = ncp->rank; + if (ncp->num_aggrs_per_node > 0) { + if (ncp->my_aggr != ncp->rank) + return NC_NOERR; + + nprocs = ncp->ina_nprocs; + rank = ncp->ina_rank; + } + /* find the starting vid for newly added variables */ start_vid = 0; nrecs = 0; /* the current number of records */ @@ -397,12 +410,16 @@ fillerup_aggregate(NC *ncp, NC *old_ncp) * variables' fill modes and overwrite local's if an inconsistency is found * Note ncp->vars.ndefined is already made consistent by this point. 
*/ - if (ncp->nprocs > 1) { + MPI_Comm comm = (ncp->num_aggrs_per_node > 0) ? ncp->ina_comm : ncp->comm; + + if (nprocs > 1) { + int mpireturn; + for (i=start_vid; ivars.ndefined; i++) noFill[i-start_vid] = (char)(ncp->vars.value[i]->no_fill); TRACE_COMM(MPI_Bcast)(noFill, (ncp->vars.ndefined - start_vid), - MPI_BYTE, 0, ncp->comm); + MPI_BYTE, 0, comm); if (mpireturn != MPI_SUCCESS) return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast"); @@ -427,9 +444,9 @@ fillerup_aggregate(NC *ncp, NC *old_ncp) nsegs = (size_t)(ncp->vars.ndefined + ncp->vars.num_rec_vars * nrecs); count = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * nsegs); #ifdef HAVE_MPI_LARGE_COUNT - offset = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nsegs); + offset = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nsegs); #else - offset = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nsegs); + offset = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * nsegs); #endif /* calculate each segment's offset and count */ @@ -446,19 +463,23 @@ fillerup_aggregate(NC *ncp, NC *old_ncp) else var_len = varp->dsizes[0]; /* divide evenly total number of variable's elements among processes */ - count[j] = var_len / ncp->nprocs; - start = count[j] * ncp->rank; - if (ncp->rank < var_len % ncp->nprocs) { - start += ncp->rank; + count[j] = var_len / nprocs; + start = count[j] * rank; + if (rank < var_len % nprocs) { + start += rank; count[j]++; } else - start += var_len % ncp->nprocs; + start += var_len % nprocs; /* calculate the starting file offset */ start *= varp->xsz; start += varp->begin; - offset[j] = (MPI_Aint)start; +#ifdef HAVE_MPI_LARGE_COUNT + offset[j] = (MPI_Count)start; +#else + offset[j] = start; +#endif if (start != offset[j]) { DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) if (status == NC_NOERR) status = err; @@ -483,19 +504,23 @@ fillerup_aggregate(NC *ncp, NC *old_ncp) else var_len = varp->dsizes[1]; /* divide total number of variable's elements among all processes */ - count[j] = var_len / ncp->nprocs; - start = count[j] 
* ncp->rank; - if (ncp->rank < var_len % ncp->nprocs) { - start += ncp->rank; + count[j] = var_len / nprocs; + start = count[j] * rank; + if (rank < var_len % nprocs) { + start += rank; count[j]++; } else - start += var_len % ncp->nprocs; + start += var_len % nprocs; /* calculate the starting file offset */ start *= varp->xsz; start += varp->begin + ncp->recsize * recno; - offset[j] = (MPI_Aint)start; +#ifdef HAVE_MPI_LARGE_COUNT + offset[j] = (MPI_Count)start; +#else + offset[j] = start; +#endif if (start != offset[j]) { DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) if (status == NC_NOERR) status = err; @@ -597,53 +622,26 @@ fillerup_aggregate(NC *ncp, NC *old_ncp) } /* k is the number of valid write requests */ NCI_Free(noFill); - - if (k == 0) { - filetype = MPI_BYTE; - } - else { - /* create fileview: a list of contiguous segment for each variable */ -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_hindexed_c(k, blocklengths, offset, - MPI_BYTE, &filetype); -#else - mpireturn = MPI_Type_create_hindexed(k, blocklengths, offset, - MPI_BYTE, &filetype); -#endif - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_hindexed"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; - } - else - MPI_Type_commit(&filetype); - } - - NCI_Free(blocklengths); NCI_Free(count); - NCI_Free(offset); - - /* when nprocs == 1, we keep I/O mode in independent mode at all time */ - fh = ncp->collective_fh; - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, filetype, "native", - MPI_INFO_NULL)); - if (k > 0) MPI_Type_free(&filetype); + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, k, offset, blocklengths); + status = (status == NC_NOERR) ? 
err : status; - bufType = MPI_BYTE; + buf_view.type = MPI_BYTE; if (buf_len > NC_MAX_INT) { #ifdef HAVE_MPI_LARGE_COUNT + int mpireturn; + mpireturn = MPI_Type_contiguous_c((MPI_Count)buf_len, MPI_BYTE, - &bufType); + &buf_view.type); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_contiguous_c"); /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; - buf_len = 0; + buf_view.size = 0; } else { - MPI_Type_commit(&bufType); - buf_len = 1; + MPI_Type_commit(&buf_view.type); } #else if (status == NC_NOERR) @@ -653,39 +651,30 @@ fillerup_aggregate(NC *ncp, NC *old_ncp) #endif } + /* write buffer is contiguous */ + buf_view.size = buf_len; + buf_view.count = 0; + buf_view.off = NULL; + buf_view.len = NULL; + buf_view.is_contig = 1; + /* write to variable collectively */ - if (ncp->nprocs > 1) { -#ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_all_c, (fh, 0, buf, (MPI_Count)buf_len, - bufType, &mpistatus)); -#else - TRACE_IO(MPI_File_write_at_all, (fh, 0, buf, (int)buf_len, - bufType, &mpistatus)); -#endif - } - else { -#ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_c, (fh, 0, buf, (MPI_Count)buf_len, - bufType, &mpistatus)); -#else - TRACE_IO(MPI_File_write_at, (fh, 0, buf, (int)buf_len, - bufType, &mpistatus)); -#endif - } + if (nprocs > 1) + wlen = ncmpio_file_write_at_all(ncp, 0, buf, buf_view); + else + wlen = ncmpio_file_write_at(ncp, 0, buf, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + /* free allocated resources */ NCI_Free(buf); - if (bufType != MPI_BYTE) MPI_Type_free(&bufType); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) status = err; - } + if (buf_view.type != MPI_BYTE) MPI_Type_free(&buf_view.type); + if (blocklengths != NULL) NCI_Free(blocklengths); + if (offset != NULL) NCI_Free(offset); + + /* reset fileview to make the entire file visible */ + err = ncmpio_file_set_view(ncp, 
0, MPI_BYTE, 0, NULL, NULL); + status = (status == NC_NOERR) ? err : status; - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) status = err; - } return status; } @@ -753,7 +742,7 @@ ncmpio_fill_var_rec(void *ncdp, } err_check: - if (ncp->safe_mode && ncp->nprocs > 1) { /* consistency check */ + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { /* consistency check */ int root_varid, status, mpireturn; MPI_Offset root_recno; @@ -801,7 +790,7 @@ ncmpio_set_fill(void *ncdp, int i, mpireturn, oldmode; NC *ncp = (NC*)ncdp; - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { int err, status, root_fill_mode=fill_mode; TRACE_COMM(MPI_Bcast)(&root_fill_mode, 1, MPI_INT, 0, ncp->comm); @@ -860,7 +849,7 @@ ncmpio_def_var_fill(void *ncdp, /* sanity check for ncdp and varid has been done in dispatchers */ varp = ncp->vars.value[varid]; - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { int root_ids[3], my_fill_null, minE, mpireturn; /* check if varid, no_fill, fill_value, are consistent */ diff --git a/src/drivers/ncmpio/ncmpio_getput.m4 b/src/drivers/ncmpio/ncmpio_getput.m4 index 363701cac2..9c0364abc1 100644 --- a/src/drivers/ncmpio/ncmpio_getput.m4 +++ b/src/drivers/ncmpio/ncmpio_getput.m4 @@ -44,20 +44,22 @@ dnl #include "ncmpio_subfile.h" #endif +#define ALWAYS_USE_INA + /* buffer layers: For write requests: buf (user buffer of internal data type) lbuf (contiguous buffer packed from buf based on buftype) cbuf (contiguous buffer packed from lbuf based on imap) - xbuf (contiguous buffer in external data type, type-casted/byte-swapped + xbuf (contiguous buffer in external data type, type-cast/byte-swapped from cbuf, ready to be used in MPI_File_write to write to file) For read requests: xbuf (contiguous buffer to be 
used in MPI_File_read to read from file. Its contents are in external data type) - cbuf (contiguous buffer type-casted/byte-swapped from xbuf, its contents - are in internal data type) + cbuf (contiguous buffer type-cast/byte-swapped from xbuf, its contents are + in internal data type) lbuf (contiguous buffer unpacked from cbuf based on imap) buf (user buffer, unpacked from lbuf based on buftype) @@ -118,10 +120,18 @@ put_varm(NC *ncp, void *xbuf=NULL; int mpireturn, err=NC_NOERR, status=NC_NOERR, buftype_is_contig; int el_size, need_convert=0, need_swap=0, need_swap_back_buf=0; - int coll_indep, xtype_is_contig=1, can_swap_in_place; - MPI_Offset nelems=0, bnelems=0, nbytes=0, offset=0; - MPI_Datatype itype, xtype=MPI_BYTE, imaptype, filetype=MPI_BYTE; - MPI_File fh; + int can_swap_in_place; + MPI_Offset nelems=0, bnelems=0, nbytes=0; + MPI_Datatype itype, imaptype; + + if (varp == NULL) { /* zero-sized request */ + itype = MPI_BYTE; + el_size = 0; + bnelems = 0; + nbytes = 0; + buftype_is_contig = 0; + goto err_check; + } /* decode buftype to obtain the followings: * itype: element data type (MPI primitive type) in buftype @@ -135,20 +145,10 @@ put_varm(NC *ncp, * el_size: byte size of itype * buftype_is_contig: whether buftype is contiguous */ - if (varp == NULL) { /* zero-sized request */ - itype = MPI_BYTE; - el_size = 0; - bnelems = 0; - nbytes = 0; - buftype_is_contig = 0; - } - else { - err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, - buftype, &itype, &el_size, &bnelems, - &nbytes, &buftype_is_contig); - if (err != NC_NOERR) goto err_check; - } - xtype_is_contig = buftype_is_contig; + err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, + buftype, &itype, &el_size, &bnelems, &nbytes, + &buftype_is_contig); + if (err != NC_NOERR) goto err_check; if (buftype == MPI_DATATYPE_NULL) { /* buftype and bufcount are ignored */ bufcount = bnelems; @@ -174,10 +174,15 @@ put_varm(NC *ncp, goto err_check; /* check if type 
conversion and Endianness byte swap is needed */ - if (varp != NULL) { /* non-zero-sized request */ - need_convert = ncmpii_need_convert(ncp->format, varp->xtype, itype); - need_swap = NEED_BYTE_SWAP(varp->xtype, itype); - } + need_convert = ncmpii_need_convert(ncp->format, varp->xtype, itype); + need_swap = NEED_BYTE_SWAP(varp->xtype, itype); + + /* check whether this is a true varm call, if yes, imaptype will be a + * newly created MPI derived data type, otherwise MPI_DATATYPE_NULL + */ + imaptype = MPI_DATATYPE_NULL; + err = ncmpii_create_imaptype(varp->ndims, count, imap, itype, &imaptype); + if (err != NC_NOERR) goto err_check; /* check if in-place byte swap can be enabled */ can_swap_in_place = 1; @@ -190,25 +195,23 @@ put_varm(NC *ncp, else if (! fIsSet(ncp->flags, NC_MODE_SWAP_ON)) { /* auto mode, as user does not explicitly enable it */ if (nbytes <= NC_BYTE_SWAP_BUFFER_SIZE) - /* If write amount is small, disable in-place swap. - * This is because the user buffer may be immutable. In this - * case, in-place swap will cause segmentation fault. Immutable - * buffers are usually small. */ + /* If write amount is small, disable in-place swap. This is + * because the user buffer may be immutable. In this case, + * in-place swap will cause segmentation fault. Immutable + * buffers are usually small. 
+ */ can_swap_in_place = 0; } } - /* check whether this is a true varm call, if yes, imaptype will be a - * newly created MPI derived data type, otherwise MPI_DATATYPE_NULL - */ - imaptype = MPI_DATATYPE_NULL; - if (varp != NULL) { /* non-zero-sized request */ - err = ncmpii_create_imaptype(varp->ndims, count, imap, itype, &imaptype); - if (err != NC_NOERR) goto err_check; - } - +#ifdef ALWAYS_USE_INA + if (!need_convert && imaptype == MPI_DATATYPE_NULL && buftype_is_contig && + (!need_swap || can_swap_in_place)) +#else if (!need_convert && imaptype == MPI_DATATYPE_NULL && - (!need_swap || (can_swap_in_place && buftype_is_contig))) { + (!need_swap || (can_swap_in_place && buftype_is_contig))) +#endif + { /* reuse buftype, bufcount, buf in later MPI file write */ xbuf = buf; if (need_swap) { @@ -216,17 +219,17 @@ put_varm(NC *ncp, need_swap_back_buf = 1; } } - else if (varp != NULL) { + else { xbuf = NCI_Malloc((size_t)nbytes); if (xbuf == NULL) { DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) goto err_check; } need_swap_back_buf = 0; - xtype_is_contig = 1; - /* pack buf to xbuf, byte-swap and type-convert on xbuf, which - * will later be used in MPI file write */ + /* Pack buf to xbuf, byte-swap and type-convert on xbuf, which will + * later be used in MPI file write. + */ err = ncmpio_pack_xbuf(ncp->format, varp, bufcount, buftype, buftype_is_contig, bnelems, itype, el_size, imaptype, need_convert, need_swap, nbytes, buf, @@ -238,16 +241,14 @@ put_varm(NC *ncp, } } - /* Set nelems and xtype which will be used in MPI read/write */ - if (buf != xbuf && varp != NULL) { + /* Set nelems which will be used in MPI read/write */ + if (buf != xbuf) { /* xbuf is a contiguous buffer */ - xtype = ncmpii_nc2mpitype(varp->xtype); nelems = bnelems; } else { /* we can safely use bufcount and buftype in MPI File read/write */ nelems = (bufcount == NC_COUNT_IGNORE) ? 
bnelems : bufcount; - xtype = buftype; } err_check: @@ -263,12 +264,22 @@ err_check: */ nbytes = 0; nelems = 0; - filetype = MPI_BYTE; - xtype = MPI_BYTE; } - if (fIsSet(reqMode, NC_REQ_COLL) && ncp->my_aggr >= 0 && ncp->nprocs > 1) { - /* intra-node write aggregation must be in collective mode */ +#ifdef ALWAYS_USE_INA + err = ncmpio_ina_req(ncp, NC_REQ_WR, varp, start, count, stride, nbytes, + xbuf); + if (status == NC_NOERR) status = err; +#else + MPI_Offset offset=0; + MPI_Datatype filetype=MPI_BYTE, xtype; + + /* Set xtype which will be used in MPI read/write */ + xtype = (nbytes == 0) ? MPI_BYTE + : (buf != xbuf) ? ncmpii_nc2mpitype(varp->xtype) : buftype; + + if (fIsSet(reqMode, NC_REQ_COLL) && ncp->num_aggrs_per_node > 0) { + /* intra-node aggregation must be in collective mode */ void *wbuf = (nbytes == 0) ? NULL : xbuf; err = ncmpio_intra_node_aggregation(ncp, NC_REQ_WR, varp, start, count, stride, nelems, xtype, wbuf); @@ -297,15 +308,8 @@ err_check: * at a time. */ - fh = ncp->independent_fh; - coll_indep = NC_REQ_INDEP; - if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) { - fh = ncp->collective_fh; - coll_indep = NC_REQ_COLL; - } - /* MPI_File_set_view is collective */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, &offset, filetype, 0, NULL, NULL); if (err != NC_NOERR) { nelems = 0; /* skip this request */ if (status == NC_NOERR) status = err; @@ -316,10 +320,10 @@ err_check: * written to the variable defined in file. Note data stored in xbuf * is in the external data type, ready to be written to file. 
*/ - err = ncmpio_read_write(ncp, NC_REQ_WR, coll_indep, offset, nelems, - xtype, xbuf, xtype_is_contig); + err = ncmpio_read_write(ncp, NC_REQ_WR, offset, nelems, xtype, xbuf); if (status == NC_NOERR) status = err; } +#endif /* done with xbuf */ if (xbuf != NULL && xbuf != buf) NCI_Free(xbuf); @@ -340,7 +344,8 @@ err_check: new_numrecs = start[0] + (count[0] - 1) * stride[0] + 1; /* note new_numrecs can be smaller than ncp->numrecs when this - * write request writes existing records */ + * write request writes existing records + */ } if (fIsSet(reqMode, NC_REQ_COLL)) { @@ -357,8 +362,9 @@ err_check: if (status == NC_NOERR) status = err; } } - /* In collective mode, ncp->numrecs is always sync-ed among - processes */ + /* In collective data mode, ncp->numrecs is always sync-ed among + * processes + */ if (ncp->numrecs < max_numrecs) { err = ncmpio_write_numrecs(ncp, max_numrecs); if (status == NC_NOERR) status = err; @@ -396,11 +402,19 @@ get_varm(NC *ncp, int reqMode) /* WR/RD/COLL/INDEP */ { void *xbuf=NULL; - int err=NC_NOERR, status=NC_NOERR, coll_indep, xtype_is_contig=1; + int err=NC_NOERR, status=NC_NOERR; int el_size, buftype_is_contig, need_swap=0, need_convert=0; - MPI_Offset nelems=0, bnelems=0, nbytes=0, offset=0; - MPI_Datatype itype, xtype=MPI_BYTE, filetype=MPI_BYTE, imaptype=MPI_DATATYPE_NULL; - MPI_File fh; + MPI_Offset nelems=0, bnelems=0, nbytes=0; + MPI_Datatype itype, imaptype=MPI_DATATYPE_NULL; + + if (varp == NULL) { /* zero-sized request */ + itype = MPI_BYTE; + el_size = 0; + bnelems = 0; + nbytes = 0; + buftype_is_contig = 0; + goto err_check; + } /* decode buftype to see if we can use buf to read from file. 
* itype: element data type (MPI primitive type) in buftype @@ -415,10 +429,9 @@ get_varm(NC *ncp, * buftype_is_contig: whether buftype is contiguous */ err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, - buftype, &itype, &el_size, &bnelems, - &nbytes, &buftype_is_contig); + buftype, &itype, &el_size, &bnelems, &nbytes, + &buftype_is_contig); if (err != NC_NOERR) goto err_check; - xtype_is_contig = buftype_is_contig; if (buftype == MPI_DATATYPE_NULL) { /* buftype and bufcount are ignored */ bufcount = bnelems; @@ -461,32 +474,36 @@ get_varm(NC *ncp, * For condition 1, buftype is decoded in ncmpii_buftype_decode() * For condition 2, imap is checked in ncmpii_create_imaptype() */ +#ifdef ALWAYS_USE_INA + if (!need_convert && imaptype == MPI_DATATYPE_NULL && + !need_swap && buftype_is_contig) +#else if (!need_convert && imaptype == MPI_DATATYPE_NULL && - (!need_swap || buftype_is_contig)) { + (!need_swap || buftype_is_contig)) +#endif + { /* reuse buftype, bufcount, buf in later MPI file read */ xbuf = buf; } else { /* allocate xbuf for reading */ xbuf = NCI_Malloc((size_t)nbytes); - xtype_is_contig = 1; if (xbuf == NULL) { DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) goto err_check; } } /* Note xbuf is the buffer to be used in MPI read calls, and hence its - * contents are in the external type */ + * contents are in the external type. + */ - /* Set nelems and xtype which will be used in MPI read/write */ + /* Set nelems which will be used in MPI read/write */ if (buf != xbuf) { /* xbuf is a contiguous buffer */ nelems = bnelems; - xtype = ncmpii_nc2mpitype(varp->xtype); } else { /* we can safely use bufcount and buftype in MPI File read/write */ nelems = (bufcount == NC_COUNT_IGNORE) ? 
bnelems : bufcount; - xtype = buftype; } err_check: @@ -496,58 +513,71 @@ err_check: /* for independent API, this process returns now */ if (fIsSet(reqMode, NC_REQ_INDEP)) return err; - /* for collective API, this process needs to participate the - * collective I/O operations, but with zero-length request + /* for collective API, this process needs to participate the collective + * I/O operations, but with zero-length request */ - filetype = MPI_BYTE; - xtype = MPI_BYTE; nbytes = 0; nelems = 0; } + +#ifdef ALWAYS_USE_INA + err = ncmpio_ina_req(ncp, NC_REQ_RD, varp, start, count, stride, nbytes, + xbuf); + if (status == NC_NOERR) status = err; +#else + MPI_Offset offset=0; + MPI_Datatype filetype=MPI_BYTE, xtype; + + /* Set xtype which will be used in MPI read/write */ + xtype = (nbytes == 0) ? MPI_BYTE + : (buf != xbuf) ? ncmpii_nc2mpitype(varp->xtype) : buftype; + + if (fIsSet(reqMode, NC_REQ_COLL) && ncp->num_aggrs_per_node > 0) { + /* intra-node aggregation must be in collective mode */ + void *rbuf = (nbytes == 0) ? NULL : xbuf; + err = ncmpio_intra_node_aggregation(ncp, NC_REQ_RD, varp, start, count, + stride, nelems, xtype, rbuf); + if (status == NC_NOERR) status = err; + } else { - /* Create the filetype for this request and calculate the beginning - * file offset for this request. If this request is contiguous in file, - * then set filetype == MPI_BYTE. Otherwise filetype will be an MPI - * derived data type. + if (nbytes > 0) { + /* Create the filetype for this request and calculate the beginning + * file offset for this request. If this request is contiguous in + * file, then set filetype == MPI_BYTE. Otherwise filetype will be + * an MPI derived data type. 
+ */ + err = ncmpio_filetype_create_vars(ncp, varp, start, count, stride, + &offset, &filetype, NULL); + if (err != NC_NOERR) { + filetype = MPI_BYTE; + xtype = MPI_BYTE; + nbytes = 0; + nelems = 0; + if (status == NC_NOERR) status = err; + } + } + + /* TODO: if record variables are too big (so big that we cannot store + * the stride between records in an MPI_Aint, for example) then we will + * have to process this one record at a time. */ - err = ncmpio_filetype_create_vars(ncp, varp, start, count, stride, - &offset, &filetype, NULL); + + /* MPI_File_set_view is collective */ + err = ncmpio_file_set_view(ncp, &offset, filetype, 0, NULL, NULL); if (err != NC_NOERR) { - filetype = MPI_BYTE; - xtype = MPI_BYTE; - nbytes = 0; - nelems = 0; + nelems = 0; /* skip this request */ if (status == NC_NOERR) status = err; } - } - - /* TODO: if record variables are too big (so big that we cannot store the - * stride between records in an MPI_Aint, for example) then we will - * have to process this one record at a time. - */ - - fh = ncp->independent_fh; - coll_indep = NC_REQ_INDEP; - if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) { - fh = ncp->collective_fh; - coll_indep = NC_REQ_COLL; - } + if (filetype != MPI_BYTE) MPI_Type_free(&filetype); - /* MPI_File_set_view is collective */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); - if (err != NC_NOERR) { - nelems = 0; /* skip this request */ + /* xtype is the element data type (MPI primitive type) in xbuf to be + * read from the variable defined in file. Note xbuf will contain data + * read from the file and hence is in the external data type. + */ + err = ncmpio_read_write(ncp, NC_REQ_RD, offset, nelems, xtype, xbuf); if (status == NC_NOERR) status = err; } - if (filetype != MPI_BYTE) MPI_Type_free(&filetype); - - /* xtype is the element data type (MPI primitive type) in xbuf to be - * read from the variable defined in file. 
Note xbuf will contain data read - * from the file and hence is in the external data type. - */ - err = ncmpio_read_write(ncp, NC_REQ_RD, coll_indep, offset, nelems, xtype, - xbuf, xtype_is_contig); - if (status == NC_NOERR) status = err; +#endif if (nelems > 0) { /* unpack xbuf into user buffer, buf */ @@ -557,7 +587,9 @@ err_check: if (status == NC_NOERR) status = err; } - if (xbuf != buf) NCI_Free(xbuf); + if (varp != NULL && xbuf != buf) + /* xbuf may be allocated only if this is a non-zero-sized request */ + NCI_Free(xbuf); return status; } @@ -599,7 +631,19 @@ ncmpio_$1_var(void *ncdp, { NC *ncp=(NC*)ncdp; - NC_var *varp=NULL; + NC_var *varp; + + /* Check if this is a true zero-sized request. Note NC_REQ_ZERO is added to + * reqMode only when an error is detected at the dispatcher level. + */ + if (!fIsSet(reqMode, NC_REQ_ZERO)) { + int i; + for (i=0; ivars.value[varid]->ndims; i++) + if (count[i] == 0) { + reqMode |= NC_REQ_ZERO; + break; + } + } /* sanity check has been done at dispatchers */ @@ -608,15 +652,22 @@ ncmpio_$1_var(void *ncdp, * write, they still need to participate the communication part of the * intra-node aggregation operation. */ - ifelse(`$1',`put',`if (ncp->my_aggr >= 0) - return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, buftype, reqMode);') +#ifdef ALWAYS_USE_INA + return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, + buftype, reqMode); +#else + if (ncp->num_aggrs_per_node > 0) + return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, + buftype, reqMode); /* this collective API has a zero-length request */ return ncmpio_getput_zero_req(ncp, reqMode); +#endif } /* obtain NC_var object pointer, varp. 
Note sanity check for ncdp and - * varid has been done in dispatchers */ + * varid has been done in dispatchers + */ varp = ncp->vars.value[varid]; #ifdef ENABLE_SUBFILING diff --git a/src/drivers/ncmpio/ncmpio_header_get.c b/src/drivers/ncmpio/ncmpio_header_get.c index 6ddd89bc10..bfda5592bc 100644 --- a/src/drivers/ncmpio/ncmpio_header_get.c +++ b/src/drivers/ncmpio/ncmpio_header_get.c @@ -34,7 +34,7 @@ static int compute_var_shape(NC *ncp) { - int i, err; + int i, err, last_fix; NC_var *first_var = NULL; /* first "non-record" var */ NC_var *first_rec = NULL; /* first "record" var */ @@ -44,6 +44,7 @@ compute_var_shape(NC *ncp) ncp->begin_rec = ncp->xsz; ncp->recsize = 0; + last_fix = -1; for (i=0; ivars.ndefined; i++) { /* ncp->vars.value[i]->len will be recomputed from dimensions in * ncmpio_NC_var_shape64() */ @@ -62,8 +63,14 @@ compute_var_shape(NC *ncp) */ ncp->begin_rec = ncp->vars.value[i]->begin + ncp->vars.value[i]->len; + last_fix = i; } } + if (last_fix >= 0) + ncp->fix_end = ncp->vars.value[last_fix]->begin + + ncp->vars.value[last_fix]->len; + else + ncp->fix_end = ncp->begin_var; if (first_rec != NULL) { if (ncp->begin_rec > first_rec->begin) @@ -316,103 +323,102 @@ hdr_len_NC_vararray(const NC_vararray *ncap, /*----< hdr_fetch() >--------------------------------------------------------*/ /* Fetch the next header chunk. The chunk buffer, pointed by gbp->base, is of - * size 'gbp->chunk' bytes. Be careful not to overwrite leftover (yet to be - * used) data in the buffer before fetching a new chunk. + * size 'gbp->ncp->hdr_chunk' bytes. Be careful not to overwrite leftover (yet + * to be used) data in the buffer before fetching a new chunk. 
*/ static int hdr_fetch(bufferinfo *gbp) { - char *mpi_name; int rank, nprocs, err=NC_NOERR, mpireturn; - MPI_Status mpistatus; + PNCIO_View buf_view; assert(gbp->base != NULL); - MPI_Comm_size(gbp->comm, &nprocs); - MPI_Comm_rank(gbp->comm, &rank); + buf_view.count = 0; + buf_view.off = NULL; + buf_view.len = NULL; + buf_view.is_contig = 1; + buf_view.type = MPI_BYTE; + + MPI_Comm_size(gbp->ncp->comm, &nprocs); + MPI_Comm_rank(gbp->ncp->comm, &rank); if (rank == 0) { char *readBuf; int readLen; size_t slack; + MPI_Offset rlen; /* any leftover data in the buffer */ - slack = gbp->chunk - (gbp->pos - gbp->base); - if (slack == gbp->chunk) slack = 0; + slack = gbp->ncp->hdr_chunk - (gbp->pos - gbp->base); + if (slack == gbp->ncp->hdr_chunk) slack = 0; - /* When gbp->chunk == (gbp->pos - gbp->base), all data in the buffer has - * been consumed. If not, then read additional header of size - * (gbp->chunk - slack) into a contiguous buffer, pointed by gbp->base + - * slack. + /* When gbp->ncp->hdr_chunk == (gbp->pos - gbp->base), all data in the + * buffer has been consumed. If not, then read additional header of + * size (gbp->ncp->hdr_chunk - slack) into a contiguous buffer, pointed + * by gbp->base + slack. */ readBuf = gbp->base; - readLen = gbp->chunk; + readLen = gbp->ncp->hdr_chunk; if (slack > 0) { /* move slack to beginning of the buffer, gbp->base */ memmove(gbp->base, gbp->pos, slack); readBuf += slack; readLen -= slack; } - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. 
- */ - memset(&mpistatus, 0, sizeof(MPI_Status)); + buf_view.size = readLen; /* fileview is already entire file visible and MPI_File_read_at does not change the file pointer */ - if (gbp->coll_mode == 1) { /* collective read */ - TRACE_IO(MPI_File_read_at_all, (gbp->collective_fh, gbp->offset, readBuf, - readLen, MPI_BYTE, &mpistatus)); - } - else { - TRACE_IO(MPI_File_read_at, (gbp->collective_fh, gbp->offset, readBuf, - readLen, MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD) - } - else { - /* Obtain the actual read amount. It may be smaller than readLen, - * when the remaining file size is smaller than read chunk size. - * Because each MPI File_read reads amount of readLen bytes, and - * readLen <= read chunk size which is <= NC_MAX_INT, calling - * MPI_Get_count() is sufficient. No need to call MPI_Get_count_c() - */ - int get_size; - MPI_Get_count(&mpistatus, MPI_BYTE, &get_size); - gbp->get_size += get_size; - - /* If actual read amount is shorter than readLen, then we zero-out - * the remaining buffer. This is because the MPI_Bcast below - * broadcasts a buffer of a fixed size, gbp->chunk. Without zeroing + if (gbp->ncp->nprocs > 1 && fIsSet(gbp->ncp->flags, NC_HCOLL)) + /* collective read */ + rlen = ncmpio_file_read_at_all(gbp->ncp, gbp->offset, readBuf, + buf_view); + else + /* independent read */ + rlen = ncmpio_file_read_at(gbp->ncp, gbp->offset, readBuf, + buf_view); + + if (rlen > 0) { + /* rlen is the actual read amount. It may be smaller than readLen, + * when the remaining file size is smaller than readLen. When + * actual read amount is smaller than readLen, then we zero-out the + * remaining buffer. This is because the MPI_Bcast below broadcasts + * a buffer of a fixed size, gbp->ncp->hdr_chunk. Without zeroing * out, valgrind will complain about the uninitialized values. 
*/ - if (get_size < readLen) - memset(readBuf + get_size, 0, readLen - get_size); + if (rlen < readLen) + memset(readBuf + rlen, 0, readLen - rlen); } + else if (rlen < 0) + err = (int)rlen; + /* only root process reads file header, keeps track of current read * file pointer location */ - gbp->offset += readLen; + gbp->offset += rlen; } - else if (gbp->coll_mode == 1) { /* collective read */ - /* other processes participate the collective call */ - TRACE_IO(MPI_File_read_at_all, (gbp->collective_fh, 0, NULL, - 0, MPI_BYTE, &mpistatus)); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD) - } + else if (gbp->ncp->nprocs > 1 && fIsSet(gbp->ncp->flags, NC_HCOLL)) { + /* Collective read: non-root ranks participate the collective call with + * a zero-sized request. + */ + buf_view.type = MPI_BYTE; + buf_view.size = 0; + ncmpio_file_read_at_all(gbp->ncp, 0, NULL, buf_view); } - if (gbp->safe_mode == 1 && nprocs > 1) { - TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, gbp->comm); + if (fIsSet(gbp->ncp->flags, NC_MODE_SAFE) && nprocs > 1) { + TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, gbp->ncp->comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast"); if (err != NC_NOERR) return err; } /* broadcast root's read (full or partial header) to other processes */ - if (nprocs > 1) - TRACE_COMM(MPI_Bcast)(gbp->base, gbp->chunk, MPI_BYTE, 0, gbp->comm); + if (nprocs > 1) { + TRACE_COMM(MPI_Bcast)(gbp->base, gbp->ncp->hdr_chunk, MPI_BYTE, 0, + gbp->ncp->comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast"); + } gbp->pos = gbp->base; @@ -503,7 +509,7 @@ hdr_get_nc_type(bufferinfo *gbp, nc_type *xtypep) if (xtype < NC_BYTE) DEBUG_RETURN_ERROR(NC_EBADTYPE) - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { if (xtype > NC_DOUBLE) DEBUG_RETURN_ERROR(NC_EBADTYPE) } @@ -536,7 +542,7 @@ hdr_get_NC_name(bufferinfo *gbp, char 
**namep, size_t *name_len) *namep = NULL; /* get nelems (string length of name) */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) return err; @@ -564,7 +570,7 @@ hdr_get_NC_name(bufferinfo *gbp, char **namep, size_t *name_len) */ padding = PNETCDF_RNDUP(nchars, X_ALIGN) - nchars; - bufremain = gbp->chunk - (gbp->pos - gbp->base); + bufremain = gbp->ncp->hdr_chunk - (gbp->pos - gbp->base); cpos = *namep; @@ -585,7 +591,7 @@ hdr_get_NC_name(bufferinfo *gbp, char **namep, size_t *name_len) *namep = NULL; return err; } - bufremain = gbp->chunk; + bufremain = gbp->ncp->hdr_chunk; } } @@ -659,7 +665,7 @@ hdr_get_NC_dim(bufferinfo *gbp, int unlimited_id, NC_dim **dimpp) else if (err != NC_NOERR) return err; /* get dim_length */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); dim_length = (MPI_Offset)tmp; @@ -730,7 +736,7 @@ hdr_get_NC_dimarray(bufferinfo *gbp, NC_dimarray *ncap) if (err != NC_NOERR) return err; /* read nelems (number of dimensions) from gbp buffer */ - if (gbp->version < 5) { /* nelems is */ + if (gbp->ncp->format < 5) { /* nelems is */ uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) return err; @@ -809,8 +815,8 @@ hdr_get_NC_attrV(bufferinfo *gbp, NC_attr *attrp) nbytes = attrp->nelems * xsz; padding = attrp->xsz - nbytes; - bufremain = gbp->chunk - (gbp->pos - gbp->base); - /* gbp->chunk is the read chunk size, which is of type 4-byte int. + bufremain = gbp->ncp->hdr_chunk - (gbp->pos - gbp->base); + /* gbp->ncp->hdr_chunk is the read chunk size, which is of type 4-byte int. 
* thus bufremain should be less than INT_MAX */ /* get values */ @@ -823,10 +829,9 @@ hdr_get_NC_attrV(bufferinfo *gbp, NC_attr *attrp) value = (void *)((char *)value + attcount); bufremain -= attcount; } else { - int err; err = hdr_fetch(gbp); if (err != NC_NOERR) return err; - bufremain = gbp->chunk; + bufremain = gbp->ncp->hdr_chunk; } } @@ -906,7 +911,7 @@ hdr_get_NC_attr(bufferinfo *gbp, NC_attr **attrpp) } /* get nelems */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); nelems = (MPI_Offset)tmp; @@ -977,7 +982,7 @@ hdr_get_NC_attrarray(bufferinfo *gbp, NC_attrarray *ncap) if (err != NC_NOERR) return err; /* read nelems (number of attributes) from gbp buffer */ - if (gbp->version < 5) { /* nelems is */ + if (gbp->ncp->format < 5) { /* nelems is */ uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) return err; @@ -1061,7 +1066,7 @@ hdr_get_NC_var(bufferinfo *gbp, else if (err != NC_NOERR) return err; /* nelems (number of dimensions) */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) { @@ -1099,7 +1104,7 @@ hdr_get_NC_var(bufferinfo *gbp, /* get [dimid ...] 
*/ for (dim=0; dimversion < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) break; @@ -1135,7 +1140,7 @@ hdr_get_NC_var(bufferinfo *gbp, ncmpii_xlen_nc_type(varp->xtype, &varp->xsz); /* get vsize */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); varp->len = (MPI_Offset)tmp; @@ -1164,7 +1169,7 @@ hdr_get_NC_var(bufferinfo *gbp, */ /* get begin */ - if (gbp->version == 1) { + if (gbp->ncp->format == 1) { uint tmp; err = hdr_get_uint32(gbp, &tmp); varp->begin = (MPI_Offset)tmp; @@ -1223,7 +1228,7 @@ hdr_get_NC_vararray(bufferinfo *gbp, if (err != NC_NOERR) return err; /* read nelems (number of variables) from gbp buffer */ - if (gbp->version < 5) { /* nelems is */ + if (gbp->ncp->format < 5) { /* nelems is */ uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) return err; @@ -1339,24 +1344,15 @@ ncmpio_hdr_get_NC(NC *ncp) assert(ncp != NULL); /* Initialize the get buffer that stores the header read from the file */ - getbuf.comm = ncp->comm; - getbuf.collective_fh = ncp->collective_fh; - getbuf.get_size = 0; - getbuf.offset = 0; /* read from start of the file */ - getbuf.safe_mode = ncp->safe_mode; - if (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)) - getbuf.coll_mode = 1; - else - getbuf.coll_mode = 0; - - /* CDF-5's minimum header size is 4 bytes more than CDF-1 and CDF-2's */ - getbuf.chunk = PNETCDF_RNDUP( MAX(MIN_NC_XSZ+4, ncp->chunk), X_ALIGN ); - - getbuf.base = (char*) NCI_Malloc(getbuf.chunk); - getbuf.pos = getbuf.base; - getbuf.end = getbuf.base + getbuf.chunk; - - /* Fetch the next header chunk. The chunk is 'gbp->chunk' bytes big */ + getbuf.ncp = ncp; + getbuf.offset = 0; /* read from start of the file */ + getbuf.base = (char*) NCI_Malloc(getbuf.ncp->hdr_chunk); + getbuf.pos = getbuf.base; + getbuf.end = getbuf.base + getbuf.ncp->hdr_chunk; + + /* Fetch the next header chunk. The chunk is 'gbp->ncp->hdr_chunk' bytes + * big. 
+ */ err = hdr_fetch(&getbuf); if (err != NC_NOERR) return err; @@ -1373,7 +1369,7 @@ ncmpio_hdr_get_NC(NC *ncp) ncmpix_getn_text((const void **)(&getbuf.pos), 8, signature); if (memcmp(signature, hdf5_signature, 8) == 0) { DEBUG_ASSIGN_ERROR(err, NC_ENOTNC3) - if (ncp->safe_mode) + if (fIsSet(ncp->flags, NC_MODE_SAFE)) fprintf(stderr,"Error: file %s is HDF5 format\n",ncp->path); } else @@ -1381,20 +1377,20 @@ ncmpio_hdr_get_NC(NC *ncp) goto fn_exit; } - /* check version number in last byte of magic */ - if (magic[3] == 0x1) { - getbuf.version = ncp->format = 1; - } else if (magic[3] == 0x2) { - getbuf.version = ncp->format = 2; - } else if (magic[3] == 0x5) { - getbuf.version = ncp->format = 5; - } else { + /* check format version number in last byte of magic */ + if (magic[3] == 0x1) + ncp->format = 1; + else if (magic[3] == 0x2) + ncp->format = 2; + else if (magic[3] == 0x5) + ncp->format = 5; + else { NCI_Free(getbuf.base); DEBUG_RETURN_ERROR(NC_ENOTNC) /* not a netCDF file */ } /* get numrecs from getbuf into ncp */ - if (getbuf.version < 5) { + if (getbuf.ncp->format < 5) { uint tmp=0; err = hdr_get_uint32(&getbuf, &tmp); if (err != NC_NOERR) goto fn_exit; @@ -1449,7 +1445,6 @@ ncmpio_hdr_get_NC(NC *ncp) if (err != NC_NOERR) goto fn_exit; fn_exit: - ncp->get_size += getbuf.get_size; NCI_Free(getbuf.base); return (err == NC_NOERR) ? 
status : err; diff --git a/src/drivers/ncmpio/ncmpio_header_put.c b/src/drivers/ncmpio/ncmpio_header_put.c index 8daf88c678..54f5f63680 100644 --- a/src/drivers/ncmpio/ncmpio_header_put.c +++ b/src/drivers/ncmpio/ncmpio_header_put.c @@ -49,7 +49,7 @@ hdr_put_NC_name(bufferinfo *pbp, size_t nchars = strlen(name); /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) err = ncmpix_put_uint32((void**)(&pbp->pos), (uint)nchars); else err = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)nchars); @@ -78,7 +78,7 @@ hdr_put_NC_dim(bufferinfo *pbp, if (err != NC_NOERR) return err; /* copy dim_length */ - if (pbp->version < 5) { + if (pbp->ncp->format < 5) { /* TODO: Isn't checking dimension size already done in def_dim()? */ if (dimp->size > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) @@ -116,7 +116,7 @@ hdr_put_NC_dimarray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* put a ZERO or ZERO64 depending on which CDF format */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), 0); else status = ncmpix_put_uint64((void**)(&pbp->pos), 0); @@ -128,7 +128,7 @@ hdr_put_NC_dimarray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined); else status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined); @@ -175,7 +175,7 @@ hdr_put_NC_attrV(bufferinfo *pbp, sz = attrp->nelems * xsz; padding = attrp->xsz - sz; - if (pbp->version < 5 && sz > NC_MAX_INT) + if (pbp->ncp->format < 5 && sz > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) memcpy(pbp->pos, attrp->xvalue, (size_t)sz); @@ -214,7 +214,7 @@ hdr_put_NC_attr(bufferinfo *pbp, if (status != NC_NOERR) return status; /* copy nelems */ - if (pbp->version < 5) { + if (pbp->ncp->format < 5) { if (attrp->nelems > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) status = ncmpix_put_uint32((void**)(&pbp->pos), 
(uint)attrp->nelems); @@ -258,7 +258,7 @@ hdr_put_NC_attrarray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* put a ZERO or ZERO64 depending on which CDF format */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), 0); else status = ncmpix_put_uint64((void**)(&pbp->pos), 0); @@ -270,7 +270,7 @@ hdr_put_NC_attrarray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined); else status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined); @@ -314,7 +314,7 @@ hdr_put_NC_var(bufferinfo *pbp, if (status != NC_NOERR) return status; /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->ndims); else status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)varp->ndims); @@ -322,7 +322,7 @@ hdr_put_NC_var(bufferinfo *pbp, /* copy [dimid ...] */ for (i=0; indims; i++) { - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->dimids[i]); else status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)varp->dimids[i]); @@ -341,7 +341,7 @@ hdr_put_NC_var(bufferinfo *pbp, /* in CDF-1 and CDF-2, a variable's size in the header is a 32-bit integer * in CDF-5, it is a 64-bit integer */ - if (pbp->version < 5) { + if (pbp->ncp->format < 5) { /* Special case, when there is no record variable, the last fixed-size * variable can be larger than 2 GiB if its file starting offset is * less than 2 GiB. 
This checking has already been done in the call @@ -367,7 +367,7 @@ hdr_put_NC_var(bufferinfo *pbp, /* in CDF-1 header, a variable's starting file offset is a 32-bit integer * in CDF-2 and CDF-5, it is a 64-bit integer */ - if (pbp->version == 1) { + if (pbp->ncp->format == 1) { if (varp->begin > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->begin); @@ -407,7 +407,7 @@ hdr_put_NC_vararray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* put a ZERO or ZERO64 depending on which CDF format */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), 0); else status = ncmpix_put_uint64((void**)(&pbp->pos), 0); @@ -419,7 +419,7 @@ hdr_put_NC_vararray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined); else status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined); @@ -441,20 +441,14 @@ hdr_put_NC_vararray(bufferinfo *pbp, int ncmpio_hdr_put_NC(NC *ncp, void *buf) { - int status; + int err; bufferinfo putbuf; MPI_Offset nrecs=0; - putbuf.comm = ncp->comm; - putbuf.collective_fh = ncp->collective_fh; - putbuf.offset = 0; - putbuf.pos = buf; - putbuf.base = buf; - putbuf.safe_mode = ncp->safe_mode; - if (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)) - putbuf.coll_mode = 1; - else - putbuf.coll_mode = 0; + putbuf.ncp = ncp; + putbuf.offset = 0; + putbuf.pos = buf; + putbuf.base = buf; /* netCDF file format: * netcdf_file = header data @@ -462,43 +456,37 @@ ncmpio_hdr_put_NC(NC *ncp, void *buf) */ /* copy "magic", 4 characters */ - if (ncp->format == 5) { - putbuf.version = 5; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic5), ncmagic5); - } - else if (ncp->format == 2) { - putbuf.version = 2; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic2), ncmagic2); - } - else { 
- putbuf.version = 1; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic1), ncmagic1); - } - if (status != NC_NOERR) return status; + if (ncp->format == 5) + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic5), ncmagic5); + else if (ncp->format == 2) + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic2), ncmagic2); + else + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic1), ncmagic1); + if (err != NC_NOERR) return err; /* copy numrecs, number of records */ nrecs = ncp->numrecs; if (ncp->format < 5) { if (nrecs > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) - status = ncmpix_put_uint32((void**)(&putbuf.pos), (uint)nrecs); + err = ncmpix_put_uint32((void**)(&putbuf.pos), (uint)nrecs); } else { - status = ncmpix_put_uint64((void**)(&putbuf.pos), (uint64)nrecs); + err = ncmpix_put_uint64((void**)(&putbuf.pos), (uint64)nrecs); } - if (status != NC_NOERR) return status; + if (err != NC_NOERR) return err; /* copy dim_list */ - status = hdr_put_NC_dimarray(&putbuf, &ncp->dims); - if (status != NC_NOERR) return status; + err = hdr_put_NC_dimarray(&putbuf, &ncp->dims); + if (err != NC_NOERR) return err; /* copy gatt_list */ - status = hdr_put_NC_attrarray(&putbuf, &ncp->attrs); - if (status != NC_NOERR) return status; + err = hdr_put_NC_attrarray(&putbuf, &ncp->attrs); + if (err != NC_NOERR) return err; /* copy var_list */ - status = hdr_put_NC_vararray(&putbuf, &ncp->vars); - if (status != NC_NOERR) return status; + err = hdr_put_NC_vararray(&putbuf, &ncp->vars); + if (err != NC_NOERR) return err; return NC_NOERR; } @@ -514,11 +502,12 @@ ncmpio_hdr_put_NC(NC *ncp, void *buf) */ int ncmpio_write_header(NC *ncp) { - char *mpi_name; - int status=NC_NOERR, mpireturn, err; + int status=NC_NOERR, mpireturn; size_t i, ntimes; - MPI_File fh; - MPI_Status mpistatus; + PNCIO_View buf_view; + + buf_view.count = 1; + buf_view.is_contig = 1; /* Write the entire header to the file. This function may be called from * a rename API. 
In that case, we cannot just change the variable name in @@ -526,10 +515,6 @@ int ncmpio_write_header(NC *ncp) * all metadata following the new name must be moved ahead. */ - fh = ncp->collective_fh; - if (NC_indep(ncp)) /* called in independent data mode */ - fh = ncp->independent_fh; - /* update file header size, as this subroutine may be called from a rename * API (var or attribute) and the new name is smaller/bigger which changes * the header size. We recalculate ncp->xsz by getting the un-aligned size @@ -555,56 +540,31 @@ int ncmpio_write_header(NC *ncp) buf_ptr = buf; for (i=0; iflags, NC_HCOLL)) /* header collective write */ + wlen = ncmpio_file_write_at_all(ncp, offset, buf_ptr, buf_view); + else /* header independent write */ + wlen = ncmpio_file_write_at(ncp, offset, buf_ptr, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; - /* explicitly initialize mpistatus object to 0. For zero-length - * read, MPI_Get_count may report incorrect result for some MPICH - * version, due to the uninitialized MPI_Status object passed to - * MPI-IO calls. Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - - if (fIsSet(ncp->flags, NC_HCOLL)) { /* header collective write */ - TRACE_IO(MPI_File_write_at_all, (fh, offset, buf_ptr, writeLen, - MPI_BYTE, &mpistatus)); - } - else { /* header independent write */ - TRACE_IO(MPI_File_write_at, (fh, offset, buf_ptr, writeLen, - MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } - else { - /* update the number of bytes written since file open. - * Because each MPI write writes no more than NC_MAX_INT, - * calling MPI_Get_count() is sufficient. 
No need to call - * MPI_Get_count_c() - */ - int put_size; - mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size); - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += ncp->xsz; - else - ncp->put_size += writeLen; - } offset += writeLen; buf_ptr += writeLen; remain -= writeLen; } NCI_Free(buf); } - else if (fIsSet(ncp->flags, NC_HCOLL)) { /* header collective write */ - /* collective write: other processes participate the collective call */ - for (i=0; inprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)) { + /* collective write: non-root ranks participate the collective call */ + buf_view.type = MPI_BYTE; + buf_view.size = 0; + ncmpio_file_write_at_all(ncp, 0, NULL, buf_view); } - if (ncp->safe_mode == 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE)) { /* broadcast root's status, because only root writes to the file */ int root_status = status; TRACE_COMM(MPI_Bcast)(&root_status, 1, MPI_INT, 0, ncp->comm); diff --git a/src/drivers/ncmpio/ncmpio_i_getput.m4 b/src/drivers/ncmpio/ncmpio_i_getput.m4 index 7f624207d3..b5b4f09db7 100644 --- a/src/drivers/ncmpio/ncmpio_i_getput.m4 +++ b/src/drivers/ncmpio/ncmpio_i_getput.m4 @@ -122,6 +122,11 @@ ncmpio_add_record_requests(NC_lead_req *lead_list, reqs[i].lead_off = reqs[0].lead_off; reqs[i].xbuf = xbuf; xbuf += rec_bufsize; + + /* copy the number of flattened offset-length pairs */ + reqs[i].npairs = reqs[0].npairs; + reqs[i].offset_start = reqs[0].offset_start; + reqs[i].offset_end = reqs[0].offset_end; } return NC_NOERR; @@ -142,7 +147,7 @@ ncmpio_igetput_varm(NC *ncp, int reqMode) { void *xbuf=NULL; - int i, err=NC_NOERR, abuf_index=-1, isize, xsize, new_nreqs, rem; + int i, j, err=NC_NOERR, abuf_index=-1, isize, xsize, new_nreqs, rem; int mpireturn, buftype_is_contig=1, need_convert, free_xbuf=0; int need_swap, can_swap_in_place, need_swap_back_buf=0; MPI_Offset nelems=0, nbytes, *ptr; @@ -520,9 +525,13 @@ ncmpio_igetput_varm(NC *ncp, } /* allocate a single array for non-leads to store 
start/count/stride */ + req->npairs = 0; if (varp->ndims == 0) { /* scalar variable, start may be NULL */ lead_req->start = NULL; req->start = NULL; + req->npairs = 1; + req->offset_start = 0; /* relative to var's begin */ + req->offset_end = varp->xsz; } else if (stride == NULL) { size_t memChunk = varp->ndims * SIZEOF_MPI_OFFSET; @@ -536,6 +545,12 @@ ncmpio_igetput_varm(NC *ncp, memcpy(ptr, start, memChunk); ptr += varp->ndims; memcpy(ptr, count, memChunk); + + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + j = IS_RECVAR(varp) ? 1 : 0; + for (i=j; indims-1; i++) + req->npairs *= count[i]; } else { size_t memChunk = varp->ndims * SIZEOF_MPI_OFFSET; @@ -551,12 +566,24 @@ ncmpio_igetput_varm(NC *ncp, memcpy(ptr, count, memChunk); ptr += varp->ndims; memcpy(ptr, stride, memChunk); + + /* calculate number of flattened offset-length pairs */ + req->npairs = (stride[varp->ndims-1] == 1) ? 1 : count[varp->ndims-1]; + j = IS_RECVAR(varp) ? 1 : 0; + for (i=j; indims-1; i++) + req->npairs *= count[i]; } /* set the properties of non-lead request */ req->xbuf = xbuf; req->nelems = nelems; + /* special treatment when there is only one offset-length pair */ + if (req->npairs == 1 && varp->ndims > 0) { + ncmpio_calc_off(ncp, varp, start, &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + if (IS_RECVAR(varp)) { /* save the last record number accessed */ if (stride == NULL) @@ -576,6 +603,8 @@ ncmpio_igetput_varm(NC *ncp, : ncp->get_lead_list; req->nelems /= count[0]; + if (req->npairs == 1) + req->offset_end = req->nelems * varp->xsz; /* add (count[0]-1) number of (sub)requests */ ncmpio_add_record_requests(lead_list, req, count[0], stride); diff --git a/src/drivers/ncmpio/ncmpio_i_varn.m4 b/src/drivers/ncmpio/ncmpio_i_varn.m4 index be9af9752c..8bad268f01 100644 --- a/src/drivers/ncmpio/ncmpio_i_varn.m4 +++ b/src/drivers/ncmpio/ncmpio_i_varn.m4 @@ -452,6 +452,12 @@ igetput_varn(NC *ncp, lead_req->max_rec = -1; 
lead_req->nonlead_num = new_nreqs; +#if 0 +MPI_Aint addr; +MPI_Get_address(lead_req->xbuf, &addr); +printf("%s at %d: lead_req xbuf=%ld nelems=%lld\n",__func__,__LINE__, addr,lead_req->nelems); +#endif + /* varn APIs have no argument stride */ fSet(lead_req->flag, NC_REQ_STRIDE_NULL); @@ -466,6 +472,8 @@ igetput_varn(NC *ncp, xbufp = (char*)xbuf; for (i=0; inpairs = 0; + if (req_nelems[i] == 0) continue; /* ignore this 0-length request i */ req->nelems = req_nelems[i]; @@ -473,11 +481,17 @@ igetput_varn(NC *ncp, req->xbuf = xbufp; xbufp += req_nelems[i] * xsize; +#if 0 +MPI_Get_address(req->xbuf, &addr); +printf("%s at %d: req i=%d xbuf=%ld off=%ld nelems=%lld\n",__func__,__LINE__, i,addr,(char*)req->xbuf - (char*)xbuf,req->nelems); +#endif + /* copy starts[i] and counts[i] over to req */ req->start = start_ptr; memcpy(start_ptr, starts[i], memChunk); start_ptr += varp->ndims; /* count[] */ if (counts == NULL || counts[i] == NULL) { + /* counts == NULL, equivalent to all 1s */ for (j=0; jndims; j++) start_ptr[j] = 1; /* start_ptr is now counts[] */ } @@ -492,6 +506,24 @@ igetput_varn(NC *ncp, if (counts == NULL || counts[i] == NULL) num_rec = 1; else num_rec = counts[i][0]; + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + if (counts == NULL || counts[i] == NULL) { + /* equivalent to all multiple var1 APIs */ + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + // req->offset_end = req->offset_start + varp->xsz; + req->offset_end = varp->xsz; + } + else { + for (j=1; jndims-1; j++) + req->npairs *= counts[i][j]; + /* special treatment for when there is only one pair */ + if (req->npairs == 1) { + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + } + max_rec = starts[i][0] + num_rec; lead_req->max_rec = MAX(lead_req->max_rec, max_rec); @@ -506,6 +538,11 @@ igetput_varn(NC *ncp, lead_list = (fIsSet(reqMode, NC_REQ_WR)) ? 
ncp->put_lead_list : ncp->get_lead_list; + + req->nelems /= counts[i][0]; + if (req->npairs == 1) + req->offset_end = req->nelems * varp->xsz; + /* append (counts[i][0]-1) number of requests to the queue */ ncmpio_add_record_requests(lead_list, req, counts[i][0], NULL); start_ptr += (counts[i][0] - 1) * 2 * varp->ndims; @@ -514,8 +551,26 @@ igetput_varn(NC *ncp, else req++; } - else + else { + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + if (counts == NULL || counts[i] == NULL) { + /* equivalent to all multiple var1 APIs */ + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + // req->offset_end = req->offset_start + varp->xsz; + req->offset_end = varp->xsz; + } + else { + for (j=0; jndims-1; j++) + req->npairs *= counts[i][j]; + /* special treatment for when there is only one pair */ + if (req->npairs == 1) { + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + } req++; + } } if (reqid != NULL) *reqid = lead_req->id; diff --git a/src/drivers/ncmpio/ncmpio_intra_node.c b/src/drivers/ncmpio/ncmpio_intra_node.c index 90d613eb47..f85c54ca48 100644 --- a/src/drivers/ncmpio/ncmpio_intra_node.c +++ b/src/drivers/ncmpio/ncmpio_intra_node.c @@ -3,31 +3,40 @@ * See COPYRIGHT notice in top-level directory. * * This file contains the implementation of intra-node aggregation feature, - * which is designed for the I/O patterns that contain many noncontiguous - * requests interleaved among processes, and spreading across a wide range of - * file space. It is particularly useful when the number of MPI processes - * allocated to a compute node is large. + * which is designed to improve performance for I/O patterns that contain many + * noncontiguous requests interleaved among processes, with a wide aggregate + * access region on each process that involves file stripes responsible by + * almost all the file servers. 
By reducing the number of processes per node + * to participate MPI-IO operations, this feature can effectively reduce the + * communication contention, particularly often happened to jobs that run a + * large the number of MPI processes per compute node. * - * This feature is enabled by setting the PnetCDF hint 'nc_num_aggrs_per_node' - * to a positive integral value indicating the desired number of processes per - * compute node to be selected as the intra-node I/O aggregators. Each process - * is assigned a unique aggregator. The non-aggregators send their requests to - * the assigned aggregators, and then the aggregators make MPI-IO requests to - * the file. + * Users can enable this feature by setting the PnetCDF I/O hint named + * 'nc_num_aggrs_per_node' to a positive integral value, indicating the desired + * number of processes per compute node to be selected as the intra-node I/O + * aggregators. Processes running on the same node are divided into groups. + * The process with the lowest rank ID is selected as the I/O aggregator of + * that group. Non-aggregators send their requests to their aggregators, and + * then the aggregators make I/O requests to the file, i.e. only aggregators + * make MPI-IO calls. * - * Such strategy can effectively reduce communication congestion due to many - * pending asynchronous messages produced in the collective write inside of - * MPI-IO. + * Because communication within a node can be achieved by memory copy operation + * and thus its cost is expected to be much lower than the inter-node + * communication, this feature can effectively reduce the communication + * congestion or exhaustion of message queues, due to many pending asynchronous + * messages produced in the two-phase I/O, the strategy used to implement + * MPI collective I/O. 
* - * The concept of intra-node request aggregation is based on the paper: + * The concept of intra-node request aggregation and its performance results + * are presented in the following paper. * Q. Kang, S. Lee, K. Hou, R. Ross, A. Agrawal, A. Choudhary, and W. Liao. * Improving MPI Collective I/O for High Volume Non-Contiguous Requests With * Intra-Node Aggregation. IEEE Transactions on Parallel and Distributed - * Systems (TPDS), 31(11):2682-2695, November 2020. + * Systems, 31(11):2682-2695, November 2020. */ #ifdef HAVE_CONFIG_H -# include +#include #endif #include @@ -41,28 +50,34 @@ #include #include "ncmpio_NC.h" +/* swap values of x and y */ +#define SWAP1(x, y, tmp) { tmp = x ; x = y; y = tmp ; } + #ifdef HAVE_MPI_LARGE_COUNT +/* swap elements of arrays x, y, and corresponding lengths and bufAddr */ #define SWAP(offsets, lengths, bufAddr, x, y) { \ MPI_Count aint; \ MPI_Count cint; \ MPI_Count d0 = (x) - offsets; \ MPI_Count d1 = (y) - offsets; \ if (d0 != d1) { \ - cint = *(x) ; *(x) = *(y) ; *(y) = cint ; \ - cint = lengths[d0] ; lengths[d0] = lengths[d1] ; lengths[d1] = cint ; \ - aint = bufAddr[d0] ; bufAddr[d0] = bufAddr[d1] ; bufAddr[d1] = aint ; \ + SWAP1(*(x), *(y), cint); \ + SWAP1(lengths[d0], lengths[d1], cint); \ + if (bufAddr != NULL) \ + SWAP1(bufAddr[d0], bufAddr[d1], aint); \ } \ } #else #define SWAP(offsets, lengths, bufAddr, x, y) { \ int int4; \ - MPI_Aint aint; \ - MPI_Aint d0 = (x) - offsets; \ - MPI_Aint d1 = (y) - offsets; \ + MPI_Offset aint; \ + MPI_Offset d0 = (x) - offsets; \ + MPI_Offset d1 = (y) - offsets; \ if (d0 != d1) { \ - aint = *(x) ; *(x) = *(y) ; *(y) = aint ; \ - int4 = lengths[d0] ; lengths[d0] = lengths[d1] ; lengths[d1] = int4 ; \ - aint = bufAddr[d0] ; bufAddr[d0] = bufAddr[d1] ; bufAddr[d1] = aint ; \ + SWAP1(*(x), *(y), aint); \ + SWAP1(lengths[d0], lengths[d1], int4); \ + if (bufAddr != NULL) \ + SWAP1(bufAddr[d0], bufAddr[d1], aint); \ } \ } #endif @@ -71,28 +86,36 @@ ((*(b) < *(c)) ? (b) : ((*(a) < *(c)) ? 
(c) : (a))) : \ ((*(b) > *(c)) ? (b) : ((*(a) < *(c)) ? (a) : (c)))) +static +size_t bin_search( +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count key, MPI_Count *base, +#else + MPI_Offset key, MPI_Offset *base, +#endif + size_t nmemb); + /*----< qsort_off_len_buf() >------------------------------------------------*/ -/* Sort three arrays of offsets, lengths, and buffer addresses based on the - * increasing order of offsets. This code is based on the qsort routine from - * Bentley & McIlroy's "Engineering a Sort Function". +/* Sort three arrays of offsets, lengths, and buffer addresses based on array + * offsets into an increasing order. This code is based on the qsort routine + * from Bentley & McIlroy's "Engineering a Sort Function". */ static void -qsort_off_len_buf(MPI_Aint num, +qsort_off_len_buf(MPI_Aint num, #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *offsets, - MPI_Count *lengths, + MPI_Count *offsets, + MPI_Count *lengths, #else - MPI_Aint *offsets, - int *lengths, + MPI_Offset *offsets, + int *lengths, #endif - MPI_Aint *bufAddr) + MPI_Aint *bufAddr) { #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt; + MPI_Count *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt, i, r; #else - MPI_Aint *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt; + MPI_Offset *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt, i, r; #endif - MPI_Aint i, r; while (1) { swap_cnt = 0; @@ -155,7 +178,8 @@ qsort_off_len_buf(MPI_Aint num, if ((r = pd - pc) > 1) { /* Iterate rather than recursively call self to save stack space */ lengths = lengths + (num - r); - bufAddr = bufAddr + (num - r); + if (bufAddr != NULL) + bufAddr = bufAddr + (num - r); offsets = pn - r; num = r; } @@ -164,174 +188,239 @@ qsort_off_len_buf(MPI_Aint num, } } -/*----< ncmpio_init_intra_node_aggr() >--------------------------------------*/ -/* When intra-node write aggregation is enabled, processes on the same node - * will be divided into groups. 
The number of groups is the number of - * aggregators on that node. The rank IDs of each group must be established. +/*----< heap_merge() >-------------------------------------------------------*/ +/* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 modified for a + * heap with smallest element at root. The recursion has been removed so that + * there are no function calls. Function calls are too expensive. * - * 1. Find information about MPI processes and their affinity to compute node. - * 2. Determine whether self process is an intra-node aggregator. - * 3. For an aggregator, find the number of non-aggregators assigned to it and - * construct rank IDs of assigned non-aggregators. - * 4. For a non-aggregator, find the rank ID of its assigned aggregator. + * Requirement: all individual offsets lists must be already sorted !!! */ -int -ncmpio_intra_node_aggr_init(NC *ncp) +static +void heap_merge(int nprocs, + const MPI_Aint *count, /* [nprocs] */ + MPI_Aint nelems, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *offsets, /* [nelems] */ + MPI_Count *blklens, /* [nelems] */ +#else + MPI_Offset *offsets, /* [nelems] */ + int *blklens, /* [nelems] */ +#endif + MPI_Aint *bufAddr) /* [nelems] */ { - char my_procname[MPI_MAX_PROCESSOR_NAME], **all_procnames=NULL; - int i, j, k, my_procname_len, num_nodes, root=0; - int *node_ids=NULL, *all_procname_lens=NULL, *nprocs_per_node; - int naggrs_my_node, num_nonaggrs; - int my_rank_index, *ranks_my_node, my_node_id, nprocs_my_node; - - /* initialize parameters of local-node aggregation */ - ncp->my_aggr = -1; /* rank ID of my aggregator */ - ncp->num_nonaggrs = 0; /* number of non-aggregators assigned */ - ncp->nonaggr_ranks = NULL; /* ranks of assigned non-aggregators */ - -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - ncp->aggr_time = 0.0; + typedef struct { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *off_list; + MPI_Count *len_list; +#else + MPI_Offset *off_list; + int *len_list; #endif + MPI_Aint 
*addr_list; + MPI_Aint count; + } heap_struct; - if (ncp->num_aggrs_per_node == 0 || ncp->num_aggrs_per_node == ncp->nprocs) - /* disable intra-node aggregation */ - return NC_NOERR; + heap_struct *a, tmp; + int i, j, heapsize, l, r, k, smallest; + size_t sum; -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double timing = MPI_Wtime(); + /* This heap_merge is not in-place, taking too much memory footprint */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *srt_off = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nelems); + MPI_Count *srt_len = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nelems); +#else + MPI_Aint *srt_off = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nelems); + int *srt_len = (int*) NCI_Malloc(sizeof(int) * nelems); #endif + MPI_Aint *srt_addr = NULL; + + if (bufAddr != NULL) + srt_addr = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nelems); + + a = (heap_struct *) NCI_Calloc(nprocs, sizeof(heap_struct)); + + /* there are nprocs number of lists to be merged */ + j = 0; + sum = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + /* each of a[j].off_list is already sorted */ + a[j].off_list = offsets + sum; + a[j].len_list = blklens + sum; + if (bufAddr != NULL) + a[j].addr_list = bufAddr + sum; + sum += count[i]; + a[j].count = count[i]; + j++; + } + } + nprocs = j; /* some count[i] may be zero */ - /* allocate space for storing the rank IDs of non-aggregators assigned to - * this rank. Note ncp->nonaggr_ranks[] will be freed when closing the - * file, if allocated. - */ - num_nonaggrs = ncp->nprocs / ncp->num_aggrs_per_node + 1; - ncp->nonaggr_ranks = (int*) NCI_Malloc(sizeof(int) * num_nonaggrs); +#define SWAP_HEAP(x, y, tmp) { tmp = x ; x = y ; y = tmp ; } + + heapsize = nprocs; - /* Collect info about compute nodes in order to select I/O aggregators. - * Note my_procname is null character terminated, but my_procname_len - * does not include the null character. 
+ /* Build a heap out of the first element from each list, with the smallest + * element of the heap at the root. The first for loop is to find and move + * the smallest a[*].off_list[0] to a[0]. */ - MPI_Get_processor_name(my_procname, &my_procname_len); - my_procname_len++; /* to include terminate null character */ + for (i = heapsize / 2 - 1; i >= 0; i--) { + k = i; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if (l < heapsize && a[l].off_list[0] < a[k].off_list[0]) + smallest = l; + else + smallest = k; - if (ncp->rank == root) { - /* root collects all procnames */ - all_procnames = (char **) NCI_Malloc(sizeof(char*) * ncp->nprocs); - if (all_procnames == NULL) - DEBUG_RETURN_ERROR(NC_ENOMEM) + if (r < heapsize && a[r].off_list[0] < a[smallest].off_list[0]) + smallest = r; - all_procname_lens = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - if (all_procname_lens == NULL) { - NCI_Free(all_procnames); - DEBUG_RETURN_ERROR(NC_ENOMEM) + if (smallest != k) { + SWAP_HEAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; } } - /* gather process name lengths from all processes first */ - MPI_Gather(&my_procname_len, 1, MPI_INT, all_procname_lens, 1, MPI_INT, - root, ncp->comm); - if (ncp->rank == root) { - int *disp; - size_t alloc_size = 0; + /* The heap keeps the smallest element in its first element, i.e. + * a[0].off_list[0]. + */ + j = 0; + for (i = 0; i < nelems; i++) { + /* extract smallest element from heap, i.e. 
the root */ + srt_off[i] = a[0].off_list[0]; + srt_len[i] = a[0].len_list[0]; + if (bufAddr != NULL) + srt_addr[i] = a[0].addr_list[0]; + a[0].count--; + + if (!a[0].count) { + a[0] = a[heapsize - 1]; + heapsize--; + } else { + a[0].off_list++; + a[0].len_list++; + if (bufAddr != NULL) + a[0].addr_list++; + } + + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if (l < heapsize && a[l].off_list[0] < a[k].off_list[0]) + smallest = l; + else + smallest = k; - for (i=0; inprocs; i++) - alloc_size += all_procname_lens[i]; + if (r < heapsize && a[r].off_list[0] < a[smallest].off_list[0]) + smallest = r; - all_procnames[0] = (char *) NCI_Malloc(alloc_size); - if (all_procnames[0] == NULL) { - NCI_Free(all_procname_lens); - NCI_Free(all_procnames); - DEBUG_RETURN_ERROR(NC_ENOMEM) + if (smallest != k) { + SWAP_HEAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; } + } - /* Construct displacement array for the MPI_Gatherv, as each process - * may have a different length for its process name. 
- */ - disp = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - disp[0] = 0; - for (i=1; inprocs; i++) { - all_procnames[i] = all_procnames[i - 1] + all_procname_lens[i - 1]; - disp[i] = disp[i - 1] + all_procname_lens[i - 1]; - } +#ifdef HAVE_MPI_LARGE_COUNT + memcpy(offsets, srt_off, sizeof(MPI_Count) * nelems); + memcpy(blklens, srt_len, sizeof(MPI_Count) * nelems); +#else + memcpy(offsets, srt_off, sizeof(MPI_Offset) * nelems); + memcpy(blklens, srt_len, sizeof(int) * nelems); +#endif + if (bufAddr != NULL) + memcpy(bufAddr, srt_addr, sizeof(MPI_Aint) * nelems); - /* gather all process names */ - MPI_Gatherv(my_procname, my_procname_len, MPI_CHAR, - all_procnames[0], all_procname_lens, disp, MPI_CHAR, - root, ncp->comm); + NCI_Free(a); + if (bufAddr != NULL) NCI_Free(srt_addr); + NCI_Free(srt_len); + NCI_Free(srt_off); +} - NCI_Free(disp); - NCI_Free(all_procname_lens); - } else - /* send process name to root */ - MPI_Gatherv(my_procname, my_procname_len, MPI_CHAR, - NULL, NULL, NULL, MPI_CHAR, root, ncp->comm); - - /* each MPI process's compute node ID */ - node_ids = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - - if (ncp->rank == root) { - /* all_procnames[] can tell us the number of nodes and number of - * processes per node. - */ - char **node_names; - int last; - - /* array of pointers pointing to unique host names (compute nodes) */ - node_names = (char **) NCI_Malloc(sizeof(char*) * ncp->nprocs); - - /* number of MPI processes running on each node */ - nprocs_per_node = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - - /* calculate nprocs_per_node[] and node_ids[] */ - last = 0; - num_nodes = 0; /* number of unique compute nodes */ - for (i=0; inprocs; i++) { - k = last; - for (j=0; j--------------------------------------------------*/ +/* When intra-node write aggregation is enabled, this subroutine initializes + * the metadata to be used for intra-node communication and I/O requests. 
+ * + * Processes on the same node will first be divided into groups. A process with + * the lowest rank ID in a group is selected as the aggregator. Only the + * aggregators call the MPI-IO functions to perform I/O to the file. Thus, this + * subroutine must be called before MPI_File_open() and should be called only + * once at ncmpio_create() or ncmpio_open(). + * + * The subroutine performs the following tasks. + * 1. Make use of the affinity of each MPI process to its compute node, + * represented by ncp->node_ids.num_nodes and ncp->node_ids.ids[]. Note + * ncp->node_ids should have already been set from a call to + * ncmpii_construct_node_list() earlier during ncmpi_create() and + * ncmpi_open() at the dispatcher. + * + ncp->node_ids.num_nodes is the number of unique compute nodes. + * + ncp->node_ids.ids[ncp->nprocs] contains node IDs for all processes. + * 2. Divide processes into groups, select aggregators, and determine whether + * self process is an intra-node aggregator. + * + ncp->my_aggr is rank ID of my aggregator. + * + if (ncp->my_aggr == ncp->rank) then this rank is an aggregator. + * 3. For an aggregator, find the number of non-aggregators assigned to it and + * construct a list of rank IDs of non-aggregators of its group. + * + ncp->num_nonaggrs is the number of non-aggregators in its group. + * 4. For a non-aggregator, find the rank ID of its assigned aggregator. + * + ncp->my_aggr is rank ID of my aggregator. + * + ncp->nonaggr_ranks[] contains the rank IDs of assigned non-aggregators. + * 5. Create a new MPI communicator consisting of only the aggregators only. + * Obtain the rank ID and total process number of the new communicator. + * + ncp->ina_comm contains the aggregators across all nodes. + * + ncp->ina_nprocs is the number of processes in intra-node communicator. + * + ncp->ina_rank is this process's rank ID in intra-node communicator. 
+ */ +int +ncmpio_ina_init(NC *ncp) +{ + int i, j, mpireturn, do_io, ina_nprocs, naggrs_my_node, first_rank; + int my_rank_index, *ranks_my_node, my_node_id, nprocs_my_node, rem; - for (i=0; iina_time_put) / sizeof(ncp->ina_time_put[0]); + ncp->ina_time_init = ncp->ina_time_flatten = 0.0; + for (i=0; iina_time_put[i] = ncp->ina_time_get[i] = 0; + ncp->maxmem_put[i] = ncp->maxmem_get[i] = 0; } + ncp->ina_npairs_put = ncp->ina_npairs_get = 0; +#endif + + /* initialize parameters of intra-node aggregation */ + ncp->my_aggr = -1; /* rank ID of my aggregator */ + ncp->num_nonaggrs = 0; /* number of non-aggregators assigned */ + ncp->nonaggr_ranks = NULL; /* ranks of assigned non-aggregators */ - MPI_Bcast(node_ids, ncp->nprocs, MPI_INT, root, ncp->comm); + /* Note that ill value of ncp->num_aggrs_per_node has been checked before + * entering this subroutine. Thus ncp->num_aggrs_per_node must be > 0. + */ - /* my_node_id is this rank's node ID */ - my_node_id = node_ids[ncp->rank]; + /* ncp->node_ids.ids[] has been established in ncmpii_construct_node_list() + * called in ncmpio_create() or ncmpio_open() before entering this + * subroutine. my_node_id is this rank's node ID. + */ + my_node_id = ncp->node_ids.ids[ncp->rank]; - /* nprocs_my_node: the number of processes in my nodes + /* nprocs_my_node: the number of processes in my nodes * ranks_my_node[]: rank IDs of all processes in my node. 
- * my_rank_index points to ranks_my_node[] where - * ranks_my_node[my_rank_index] == ncp->rank + * my_rank_index: points to ranks_my_node[] where + * ranks_my_node[my_rank_index] == ncp->rank */ ranks_my_node = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); my_rank_index = -1; nprocs_my_node = 0; for (i=0; inprocs; i++) { - if (node_ids[i] == my_node_id) { + if (ncp->node_ids.ids[i] == my_node_id) { if (i == ncp->rank) my_rank_index = nprocs_my_node; ranks_my_node[nprocs_my_node] = i; @@ -339,80 +428,181 @@ ncmpio_intra_node_aggr_init(NC *ncp) } } assert(my_rank_index >= 0); - /* Now, ranks_my_node[my_rank_index] == ncp->rank */ - NCI_Free(node_ids); - - /* make sure number of aggregators in my node <= nprocs_my_node */ + /* Make sure number of aggregators in my node <= nprocs_my_node. In some + * cases, the number of processes allocated to the last few nodes can be + * less than others. + */ naggrs_my_node = MIN(ncp->num_aggrs_per_node, nprocs_my_node); - /* calculate the number of non-aggregators assigned to an aggregator. - * Note num_nonaggrs includes self. + /* For each aggregation group, calculate the number of non-aggregators, + * ncp->num_nonaggrs. Note ncp->num_nonaggrs includes self rank. + */ + ncp->num_nonaggrs = nprocs_my_node / naggrs_my_node; + + /* calculate the number of ranks in each INA group, ncp->num_nonaggrs, + * and the aggregator's rank, first_rank. */ - num_nonaggrs = nprocs_my_node / naggrs_my_node; - if (nprocs_my_node % naggrs_my_node) num_nonaggrs++; - - if (num_nonaggrs == 1) - /* disable aggregation if the number of non-aggregators assigned to - * this aggregator is 1. Note num_nonaggrs includes self. It is - * possible for aggregation enabled or disabled on different nodes and - * even different aggregation groups on the same node. 
+ rem = nprocs_my_node % naggrs_my_node; + if (rem > 0) { /* non-divisible case */ + ncp->num_nonaggrs++; + if (my_rank_index < rem * ncp->num_nonaggrs) + /* first rank of my INA group */ + first_rank = my_rank_index - my_rank_index % ncp->num_nonaggrs; + else { + first_rank = rem * ncp->num_nonaggrs; + ncp->num_nonaggrs--; + first_rank = my_rank_index - (my_rank_index - first_rank) % ncp->num_nonaggrs; + } + } + else /* divisible case */ + first_rank = my_rank_index - my_rank_index % ncp->num_nonaggrs; + + /* Adjust the number of non-aggregators for the last group of each node, + * to make sure it does not go beyond nprocs_my_node. + */ + ncp->num_nonaggrs = MIN(ncp->num_nonaggrs, nprocs_my_node - first_rank); + + /* Assign the first rank as the intra-node aggregator of this group and + * set the rank ID of my aggregator for each process. + */ + ncp->my_aggr = ranks_my_node[first_rank]; + + if (ncp->num_nonaggrs == 1) { + /* When the number of processes in this group is 1, the aggregation + * is not performed. Note num_nonaggrs includes self rank. * - * Use whether ncp->my_aggr < 0 to tell if aggregation is disabled or - * enabled. + * Note this does not mean intra-node aggregation is disabled. The + * indicator of whether intra-node aggregation is enabled or disabled + * is ncp->num_aggrs_per_node, whose value should be consistent across + * all processes. It is possible for some groups containing only one + * process, in which the aggregation is not necessarily performed + * within that group. */ - ncp->my_aggr = -1; - else { - /* find the rank ID of aggregator assigned to this rank */ - ncp->my_aggr = ranks_my_node[my_rank_index - my_rank_index % num_nonaggrs]; + assert(ncp->my_aggr == ncp->rank); + } + else if (ncp->my_aggr == ncp->rank) { /* ncp->num_nonaggrs > 1 */ + /* Construct ncp->nonaggr_ranks[], the rank IDs of non-aggregators of + * this group. Note ncp->nonaggr_ranks[], if malloc-ed, will only be + * freed when closing the file. 
+ */ + ncp->nonaggr_ranks = (int*)NCI_Malloc(sizeof(int) * ncp->num_nonaggrs); + + memcpy(ncp->nonaggr_ranks, ranks_my_node + first_rank, + sizeof(int) * ncp->num_nonaggrs); + } + NCI_Free(ranks_my_node); + + /* Next step is to construct a new MPI communicator consisting of all + * intra-node aggregators. It will later be used to call MPI_File_open(), + * so that only aggregators call MPI-IO functions to access the file. + * + * When using the PnetCDF's internal PNCIO driver, we can pass a list of + * node IDs of the new communicator to the PNCIO file handler, + * ncp->pncio_fh, so to prevent the driver from the repeated work of + * constructing the list of node IDs, node_ids.ids[]. If using MPI-IO + * driver, then ROMIO will do this internally again anyway. + */ - if (ncp->my_aggr == ncp->rank) { /* this rank is an aggregator */ - /* Set the number of non-aggregators assigned to this rank. For the - * last group, make sure it does not go beyond nprocs_my_node. + do_io = (ncp->my_aggr == ncp->rank) ? 1 : 0; + + /* construct an array containing ranks of aggregators */ + ncp->ina_node_list = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + TRACE_COMM(MPI_Allgather)(&do_io, 1, MPI_INT, ncp->ina_node_list, 1, + MPI_INT,ncp->comm); + + /* Construct ncp->node_ids.ids[] and ncp->ina_node_list[]. Their contents + * depend on the layout of MPI process allocation to the compute nodes. + * The common layouts can be two kinds: + * + cyclic - MPI ranks are assigned to nodes round-robin-ly, + * + block - MPI ranks are assigned to a node and then move on to next. + * + * Below uses an example of nodes=3, nprocs=10, * num_aggrs_per_node=2. 
+ * ncp->node_ids.ids[] should be + * block process allocation: 0,0,0,0,1,1,1,2,2,2 + * cyclic process allocation: 0,1,2,0,1,2,0,1,2,0 + * Accordingly, ncp->ina_node_list[] can be two kinds + * block process allocation: 1,0,1,0,1,0,1,1,0,1 + * cyclic process allocation: 1,1,1,0,0,0,1,1,1,0 + */ + + /* ncp->node_ids.ids[]: node IDs of processes in the new MPI communicator. + * ncp->ina_node_list[]: the rank IDs of the new MPI communicator. + */ + ina_nprocs = 0; + for (j=0,i=0; inprocs; i++) { + if (ncp->ina_node_list[i]) { + ina_nprocs++; /* count the total number of INA aggregators */ + + ncp->ina_node_list[j] = i; + /* Modify ncp->node_ids.ids[] to store the node IDs of the + * processes in the new communicator. Note ncp->node_ids.ids[] from + * now on is used by PnetCDF's PNCIO driver only. */ - ncp->num_nonaggrs = MIN(num_nonaggrs, nprocs_my_node - my_rank_index); - if (ncp->num_nonaggrs == 1) - /* disable aggregation, as this aggregation group contains only - * self rank - */ - ncp->my_aggr = -1; - else - /* copy the rank IDs over to ncp->nonaggr_ranks[] */ - memcpy(ncp->nonaggr_ranks, - ranks_my_node + my_rank_index, - sizeof(int) * num_nonaggrs); + ncp->node_ids.ids[j] = ncp->node_ids.ids[i]; + j++; } } - NCI_Free(ranks_my_node); - if (ncp->my_aggr < 0) { - /* free ncp->nonaggr_ranks if aggregation is not enabled */ - NCI_Free(ncp->nonaggr_ranks); - ncp->nonaggr_ranks = NULL; + /* Make MPI calls to create a new communicator. 
*/ + MPI_Group origin_group, ina_group; + TRACE_COMM(MPI_Comm_group)(ncp->comm, &origin_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Comm_group"); + TRACE_COMM(MPI_Group_incl)(origin_group, ina_nprocs, ncp->ina_node_list, &ina_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_incl"); + TRACE_COMM(MPI_Comm_create)(ncp->comm, ina_group, &ncp->ina_comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Comm_create"); + TRACE_COMM(MPI_Group_free)(&ina_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_free"); + TRACE_COMM(MPI_Group_free)(&origin_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_free"); + + /* Non-aggregators will have ncp->ina_comm set to MPI_COMM_NULL */ + if (ncp->ina_comm == MPI_COMM_NULL) { + ncp->ina_nprocs = 0; + ncp->ina_rank = -1; + } + else { + MPI_Comm_size(ncp->ina_comm, &ncp->ina_nprocs); + MPI_Comm_rank(ncp->ina_comm, &ncp->ina_rank); } - /* TODO: For automatically determine Whether to enable intra-node write - * aggregation, this should be done right before each collective write - * call. - * 1. obtain hint cb_noddes, and striping_unit + /* TODO: automatically determine whether or not to enable intra-node + * aggregation. + * + * The ideal case is it can be determined right before each collective + * write call, because only at that time, the communication pattern is + * known. If the pattern can cause contention, then enable it. Otherwise, + * disable it. + * + * Such mechanism may depends on the followings. + * 1. MPI-IO hint cb_noddes, and striping_unit * 2. calculate aggregate access region - * In each round of two-phase I/O, when the number of senders to each - * cb_nodes is very large, then intra-node aggregation should be enabled. - * Average of all nprocs_per_node may be a factor for determining whether - * to enable intra-node aggregation. 
It indicates whether the high number - * of processes are allocated on the same node. + * 3. If the number of senders to each cb_nodes is very large, then + * intra-node aggregation should be enabled. + * 4. Average of nprocs_per_node across all processes may be a factor for + * determining whether to enable intra-node aggregation. It indicates + * whether the high number of processes are allocated on the same + * node. */ #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - ncp->aggr_time = MPI_Wtime() - timing; + ncp->ina_time_init = MPI_Wtime() - timing; #endif return NC_NOERR; } /*----< flatten_subarray() >-------------------------------------------------*/ -/* flatten a subarray request into a list of offset-length pairs */ +/* Flatten a subarray request, specified by start[], count[], and stride[] into + * a list of file offset-length pairs, offsets[] and lengths[]. + */ static int flatten_subarray(int ndim, /* number of dimensions */ int el_size, /* array element size */ @@ -426,7 +616,7 @@ flatten_subarray(int ndim, /* number of dimensions */ MPI_Count *offsets, /* OUT: array of offsets */ MPI_Count *lengths /* OUT: array of lengths */ #else - MPI_Aint *offsets, /* OUT: array of offsets */ + MPI_Offset *offsets, /* OUT: array of offsets */ int *lengths /* OUT: array of lengths */ #endif ) @@ -503,12 +693,26 @@ flatten_subarray(int ndim, /* number of dimensions */ subarray_len *= count[ndim]; } + /* check if the list can be coalesced */ + for (i=0, j=1; j<*npairs; j++) { + if (offsets[i] + lengths[i] == offsets[j]) + lengths[i] += lengths[j]; + else { + i++; + if (i < j) { + offsets[i] = offsets[j]; + lengths[i] = lengths[j]; + } + } + } + *npairs = i + 1; + return NC_NOERR; } -/*----< flatten_req() >-----------------------------------------------------*/ -/* flatten one write request into offset-length pairs. 
- * offsets and lengths are allocated here and need to be freed by the caller +/*----< flatten_req() >------------------------------------------------------*/ +/* Flatten one subarray request into offset-length pairs. Arrays offsets and + * lengths are allocated in this subroutine and need to be freed by the caller. */ static int flatten_req(NC *ncp, @@ -516,19 +720,29 @@ flatten_req(NC *ncp, const MPI_Offset *start, const MPI_Offset *count, const MPI_Offset *stride, + int *is_incr, /* OUT: are offsets incrementing */ MPI_Aint *num_pairs, /* OUT: number of off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count **offsets, /* OUT: array of flattened offsets */ - MPI_Count **lengths /* OUT: array of flattened lengths */ + MPI_Count **off_ptr, /* OUT: array of flattened offsets */ + MPI_Count **len_ptr /* OUT: array of flattened lengths */ #else - MPI_Aint **offsets, /* OUT: array of flattened offsets */ - int **lengths /* OUT: array of flattened lengths */ + MPI_Offset **off_ptr, /* OUT: array of flattened offsets */ + int **len_ptr /* OUT: array of flattened lengths */ #endif ) { - int j, err=NC_NOERR, ndims; + int i, j, err=NC_NOERR, ndims; MPI_Aint num, idx; MPI_Offset var_begin, *shape, count0, *ones=NULL; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count prev_end_off; + MPI_Count *offsets; + MPI_Count *lengths; +#else + MPI_Offset prev_end_off; + MPI_Offset *offsets; + int *lengths; +#endif *num_pairs = 0; /* total number of offset-length pairs */ @@ -537,15 +751,17 @@ flatten_req(NC *ncp, */ if (varp->ndims == 0) { /* scalar variable */ #ifdef HAVE_MPI_LARGE_COUNT - *offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count)); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count)); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * 2); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * 2); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint)); - *lengths = (int*) NCI_Malloc(sizeof(int)); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * 2); + lengths = 
(int*) NCI_Malloc(sizeof(int) * 2); #endif - (*offsets)[0] = varp->begin; - (*lengths)[0] = varp->xsz; + offsets[0] = varp->begin; + lengths[0] = varp->xsz; *num_pairs = 1; + *off_ptr = offsets; + *len_ptr = lengths; return NC_NOERR; } else if (varp->ndims == 1 && IS_RECVAR(varp)) { /* scalar variable */ @@ -555,22 +771,24 @@ flatten_req(NC *ncp, num = 1; if (stride != NULL && stride[varp->ndims-1] > 1) num = count[varp->ndims-1]; /* count of last dimension */ - for (j=0; jndims-1; j++) - num *= count[j]; /* all count[] except the last dimension */ + for (i=0; indims-1; i++) + num *= count[i]; /* all count[] except the last dimension */ } *num_pairs = num; #ifdef HAVE_MPI_LARGE_COUNT - *offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (num+1)); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (num+1)); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * num); - *lengths = (int*) NCI_Malloc(sizeof(int) * num); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * (num+1)); + lengths = (int*) NCI_Malloc(sizeof(int) * (num+1)); #endif + *off_ptr = offsets; + *len_ptr = lengths; if (stride == NULL) { /* equivalent to {1, 1, ..., 1} */ ones = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * varp->ndims); - for (j=0; jndims; j++) ones[j] = 1; + for (i=0; indims; i++) ones[i] = 1; } ndims = varp->ndims; @@ -589,13 +807,26 @@ flatten_req(NC *ncp, count0 = 1; idx = 0; - for (j=0; jxsz, var_begin, shape, start, count, (stride == NULL) ? 
ones : stride, - &num, /* OUT: num of off-len pairs */ - *offsets + idx, /* OUT: array of offsets */ - *lengths + idx); /* OUT: array of lengths */ + &num, /* OUT: num of off-len pairs */ + offsets + idx, /* OUT: array of offsets */ + lengths + idx); /* OUT: array of lengths */ + + if (num == 0) continue; + + /* check if offsets[] are in an increasing order */ + for (j=0; j offsets[idx+j]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx+j]; + } + idx += num; assert(idx <= *num_pairs); @@ -605,30 +836,46 @@ flatten_req(NC *ncp, if (ones != NULL) NCI_Free(ones); + /* num_pairs may be less than originally calculated, because offset-length + * pairs are coalesced in the call to flatten_subarray(). + */ + *num_pairs = idx; + return err; } /*----< flatten_reqs() >-----------------------------------------------------*/ -/* flatten all write requests into offset-length pairs. - * offsets and lengths are allocated here and need to be freed by the caller +/* Flatten multiple subarray requests into file offset-length pairs. Arrays + * offsets and lengths are allocated here and need to be freed by the caller. 
*/ static int flatten_reqs(NC *ncp, + int reqMode, /* IN: NC_REQ_RD or NC_REQ_WR */ int num_reqs, /* IN: # requests */ const NC_req *reqs, /* [num_reqs] requests */ + int *is_incr, /* OUT: are offsets incrementing */ MPI_Aint *num_pairs, /* OUT: total number of off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count **offsets, /* OUT: array of flattened offsets */ - MPI_Count **lengths /* OUT: array of flattened lengths */ + MPI_Count **off_ptr, /* OUT: array of flattened offsets */ + MPI_Count **len_ptr /* OUT: array of flattened lengths */ #else - MPI_Aint **offsets, /* OUT: array of flattened offsets */ - int **lengths /* OUT: array of flattened lengths */ + MPI_Offset **off_ptr, /* OUT: array of flattened offsets */ + int **len_ptr /* OUT: array of flattened lengths */ #endif ) { int i, j, status=NC_NOERR, ndims, max_ndims=0; MPI_Aint num, idx; MPI_Offset *start, *count, *shape, *stride, *ones; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count prev_end_off; + MPI_Count *offsets; + MPI_Count *lengths; +#else + MPI_Offset prev_end_off; + MPI_Offset *offsets; + int *lengths; +#endif *num_pairs = 0; /* total number of offset-length pairs */ @@ -636,57 +883,60 @@ flatten_reqs(NC *ncp, * contiguous memory space for storing off-len pairs */ for (i=0; iput_lead_list + reqs[i].lead_off; - ndims = lead->varp->ndims; - max_ndims = MAX(max_ndims, ndims); - if (ndims > 0) { - start = reqs[i].start; - count = start + ndims; - stride = count + ndims; - } + /* reqs[i].npairs is the number of offset-length pairs of this request, + * calculated in ncmpio_igetput_varm() and igetput_varn() + */ + *num_pairs += reqs[i].npairs; + if (fIsSet(reqMode, NC_REQ_WR)) + ndims = ncp->put_lead_list[reqs[i].lead_off].varp->ndims; else - start = count = stride = NULL; - - /* for record variable, each reqs[] is within a record */ - if (IS_RECVAR(lead->varp)) { - ndims--; - start++; - count++; - stride++; - } - if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - - if (ndims < 0) continue; - if 
(ndims == 0) { /* 1D record variable */ - (*num_pairs)++; - continue; - } - num = 1; - if (stride != NULL && stride[ndims-1] > 1) - num = count[ndims-1]; /* count of last dimension */ - for (j=0; jget_lead_list[reqs[i].lead_off].varp->ndims; + max_ndims = MAX(max_ndims, ndims); } /* now we can allocate a contiguous memory space for the off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - *offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs)); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs)); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs+1)); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs+1)); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * (*num_pairs)); - *lengths = (int*) NCI_Malloc(sizeof(int) * (*num_pairs)); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * (*num_pairs+1)); + lengths = (int*) NCI_Malloc(sizeof(int) * (*num_pairs+1)); #endif - idx = 0; + *off_ptr = offsets; + *len_ptr = lengths; ones = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * max_ndims); for (i=0; iput_lead_list + reqs[i].lead_off; + NC_lead_req *lead; + if (fIsSet(reqMode, NC_REQ_WR)) + lead = ncp->put_lead_list + reqs[i].lead_off; + else + lead = ncp->get_lead_list + reqs[i].lead_off; + + if (reqs[i].npairs == 1) { + /* When reqs[i] contains only one offset-length pair, re-use + * reqs[i].offset_start, which has been generated earlier at a call + * to ncmpio_intra_node_aggregation_nreqs(). 
+ */ + offsets[idx] = reqs[i].offset_start; + lengths[idx] = reqs[i].nelems * lead->varp->xsz; + + /* check if offsets[] are in an increasing order */ + if (prev_end_off > offsets[idx]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx]; + idx++; + continue; + } ndims = lead->varp->ndims; if (ndims > 0) { @@ -715,20 +965,37 @@ flatten_reqs(NC *ncp, if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - /* flatten each request into a list of offset-length pairs and - * append to the end of offsets and lengths + /* flatten each request into a list of offset-length pairs and append + * to the end of offsets and lengths */ flatten_subarray(ndims, lead->varp->xsz, var_begin, shape, start, count, (stride == NULL) ? ones : stride, - &num, /* OUT: number of off-len pairs */ - *offsets + idx, /* OUT: array of offsets */ - *lengths + idx); /* OUT: array of lengths */ + &num, /* OUT: number of off-len pairs */ + offsets + idx, /* OUT: array of offsets */ + lengths + idx); /* OUT: array of lengths */ + + /* check if offsets[] are in an increasing order */ + for (j=0; j offsets[idx+j]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx+j]; + } idx += num; } NCI_Free(ones); + /* num_pairs may be less than originally calculated, because offset-length + * pairs are coalesced in the call to flatten_subarray(). + */ + *num_pairs = idx; + for (i=0; iput_lead_list + reqs[i].lead_off; + NC_lead_req *lead; + if (fIsSet(reqMode, NC_REQ_WR)) + lead = ncp->put_lead_list + reqs[i].lead_off; + else + lead = ncp->get_lead_list + reqs[i].lead_off; if (fIsSet(lead->flag, NC_REQ_TO_FREE)) { NCI_Free(lead->start); lead->start = NULL; @@ -738,187 +1005,434 @@ flatten_reqs(NC *ncp, return status; } -/*----< construct_buf_type() >-----------------------------------------------*/ -/* construct an MPI derived datatype for I/O buffers from the request list, by - * concatenate all buffers. 
+/*----< flat_buf_type() >----------------------------------------------------*/ +/* Scan the nonblocking requests, pointed by reqs, and build the offset-length + * pairs of all buffers, xbuf. Note xbuf in each nonblocking request is a + * contiguous buffer (packed from the user buffer for the write operations). + * For record variables, if a user request is accessing more than one record, + * the request is split into into multiple NC_req objects, one for each record. */ static int -construct_buf_type(const NC *ncp, - int num_reqs, /* IN: # requests */ - const NC_req *reqs, /* [num_reqs] requests */ - MPI_Aint *bufLen, /* OUT: buffer size in bytes */ - MPI_Datatype *bufType) /* OUT: buffer datatype */ +flat_buf_type(const NC *ncp, + int reqMode, /* IN: NC_REQ_RD or NC_REQ_WR */ + int num_reqs, /* IN: # requests */ + const NC_req *reqs, /* IN: [num_reqs] requests */ + PNCIO_View *buf_view, /* OUT: flattened buftype */ + void **buf) /* OUT: pointer to I/O buffer */ +/* TODO: */ +#if 1 { - int i, err, mpireturn, status=NC_NOERR; + int i, j, err=NC_NOERR; NC_lead_req *lead; + MPI_Aint addr, addr0; +/* buffer offset should be of type MPI_Aint. length should be size_t. 
*/ + + buf_view->type = MPI_BYTE; + buf_view->size = 0; + buf_view->count = 0; + buf_view->off = NULL; + buf_view->len = NULL; + buf_view->is_contig = 1; + + if (num_reqs == 0) + return NC_NOERR; + buf_view->off = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * num_reqs); #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); - MPI_Count *disps = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); + buf_view->len = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * num_reqs); #else - int *blocklens = (int*) NCI_Malloc(sizeof(int) * num_reqs); - MPI_Aint *disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + buf_view->len = (int*) NCI_Malloc(sizeof(int) * num_reqs); #endif - *bufLen = 0; - for (i=0; iput_lead_list + : ncp->get_lead_list; + + MPI_Get_address(lead[reqs[0].lead_off].xbuf, &addr0); +// printf("%s at %d: lead xbuf=%ld nelems=%lld\n",__func__,__LINE__, addr0,lead[reqs[0].lead_off].nelems); +// assert(reqs[0].xbuf == lead[reqs[0].lead_off].xbuf); + + /* set buf_view->off[0] and buf_view->len[0] */ + MPI_Get_address(reqs[0].xbuf, &addr0); /* displacement uses MPI_BOTTOM */ + buf_view->off[0] = 0; + + /* buf_view->len[] are in bytes */ + buf_view->len[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; +#if 0 +printf("%s at %d: buf_view->len[0]=%lld nelems=%lld\n",__func__,__LINE__, buf_view->len[0],reqs[0].nelems); +j=0; +printf("%s at %d: buf_view xbuf=%ld off[%d]=%lld nelems=%lld\n",__func__,__LINE__, addr0,j,buf_view->off[j],reqs[0].nelems); +#endif + + +/* +int *wkl, nelems; char *xbuf; +j = 0; +wkl = (int*) malloc(buf_view->len[j]); +nelems=buf_view->len[j]/4; xbuf = (char*)reqs[j].xbuf + buf_view->off[j]; +memcpy(wkl, xbuf, nelems*4); ncmpii_in_swapn(wkl, nelems, 4); +printf("%s at %d: nelems=%d off=%lld buf=(%p) ",__func__,__LINE__, nelems, buf_view->off[j], xbuf); +for (i=0; isize = buf_view->len[0]; + for (i=0, j=1; joff[j] = addr - addr0; - /* blocklens[] in bytes */ - lead = 
ncp->put_lead_list + reqs[i].lead_off; - blocklens[i] = reqs[i].nelems * lead->varp->xsz; +// printf("%s at %d: buf_view xbuf=%ld off[%d]=%lld nelems=%lld\n",__func__,__LINE__, addr,j,buf_view->off[j],reqs[j].nelems); - *bufLen += blocklens[i]; - } +// assert(reqs[j].xbuf == lead[reqs[j].lead_off].xbuf); + /* buf_view->len[] are in bytes */ + buf_view->len[j] = reqs[j].nelems * lead[reqs[j].lead_off].varp->xsz; - /* construct buffer derived datatype */ -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, - MPI_BYTE, bufType); +/* +wkl = (int*) malloc(buf_view->len[j]); +nelems=buf_view->len[j]/4; +xbuf = (char*)reqs[j].xbuf; // + buf_view->off[j]; +xbuf = (char*)(*buf) + buf_view->off[j]; +memcpy(wkl, xbuf, nelems*4); ncmpii_in_swapn(wkl, nelems, 4); +printf("%s at %d: nelems=%d off=%lld buf=(%p) ",__func__,__LINE__, nelems, buf_view->off[j], xbuf); +for (i=0; isize += buf_view->len[j]; + + /* coalesce the off-len pairs */ + if (buf_view->off[i] + buf_view->len[i] == buf_view->off[j]) + buf_view->len[i] += buf_view->len[j]; + else { + i++; + if (i < j) { + buf_view->off[i] = buf_view->off[j]; + buf_view->len[i] = buf_view->len[j]; + } + } + } + /* After coalescing, the true number of requests may be reduced */ +// printf("%s at %d: buf_view->size=%lld\n",__func__,__LINE__, buf_view->size); #else - mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, - MPI_BYTE, bufType); -#endif - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; + /* set buf_view->off[0] and buf_view->len[0] */ + MPI_Get_address(reqs[0].xbuf, &addr); /* displacement uses MPI_BOTTOM */ + buf_view->off[0] = addr; + + lead = (fIsSet(reqMode, NC_REQ_WR)) ? 
ncp->put_lead_list + : ncp->get_lead_list; - *bufType = MPI_DATATYPE_NULL; + /* buf_view->len[] are in bytes */ + buf_view->len[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; + ? *buf = lead[reqs[0].lead_off].xbuf; + + buf_view->size = buf_view->len[0]; + for (i=0, j=1; joff[j] = addr; + + /* buf_view->len[] are in bytes */ + buf_view->len[j] = reqs[j].nelems * lead[reqs[j].lead_off].varp->xsz; + + /* accumulate buffer type size */ + buf_view->size += buf_view->len[j]; + + /* coalesce the off-len pairs */ + if (buf_view->off[i] + buf_view->len[i] == buf_view->off[j]) + buf_view->len[i] += buf_view->len[j]; + else { + i++; + if (i < j) { + buf_view->off[i] = buf_view->off[j]; + buf_view->len[i] = buf_view->len[j]; + } + } } - else { - MPI_Type_commit(bufType); + /* After coalescing, the true number of requests may be reduced */ +#endif + + if (i + 1 < num_reqs) { + num_reqs = i + 1; /* num_reqs is reduced */ + buf_view->off = (MPI_Offset*)NCI_Realloc(buf_view->off, + sizeof(MPI_Offset) * num_reqs); #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count typeSize; - MPI_Type_size_c(*bufType, &typeSize); + buf_view->len = (MPI_Offset*)NCI_Realloc(buf_view->len, + sizeof(MPI_Offset) * num_reqs); #else - int typeSize; - MPI_Type_size(*bufType, &typeSize); + buf_view->len = (int*) NCI_Realloc(buf_view->len, + sizeof(int) * num_reqs); #endif - assert(typeSize == *bufLen); } - NCI_Free(blocklens); - NCI_Free(disps); - - return status; -} + buf_view->count = num_reqs; + buf_view->is_contig = (num_reqs <= 1); -/*----< intra_node_aggregation() >-------------------------------------------*/ -/* This is a collective call */ -static int -intra_node_aggregation(NC *ncp, - MPI_Aint num_pairs, + /* construct buf_view->type if it is noncontiguous */ + if (num_reqs > 1) { + int mpireturn; #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *offsets, - MPI_Count *lengths, + mpireturn = MPI_Type_create_hindexed_c(num_reqs, buf_view->len, + buf_view->off, MPI_BYTE, + &buf_view->type); #else - MPI_Aint 
*offsets, - int *lengths, + MPI_Aint *disp; +#if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view->off; +#else + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + for (j=0; joff[j]; +#endif + + mpireturn = MPI_Type_create_hindexed(num_reqs, buf_view->len, disp, + MPI_BYTE, &buf_view->type); +#if SIZEOF_MPI_AINT != SIZEOF_MPI_OFFSET + NCI_Free(disp); #endif - MPI_Offset bufCount, - MPI_Datatype bufType, - void *buf) +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); + + buf_view->type = MPI_BYTE; + NCI_Free(buf_view->off); + NCI_Free(buf_view->len); + buf_view->off = NULL; + buf_view->len = NULL; + buf_view->count = 0; + buf_view->size = 0; + } + else { + MPI_Type_commit(&buf_view->type); + } + } + + return err; +} +#else { - int i, j, err, mpireturn, status=NC_NOERR, nreqs; - char *recv_buf=NULL, *wr_buf = NULL; - MPI_Aint npairs=0, *msg; - MPI_Offset offset=0, buf_count; - MPI_Datatype recvTypes, fileType=MPI_BYTE; - MPI_File fh; - MPI_Request *req=NULL; -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double timing = MPI_Wtime(); + int i, j, err, mpireturn, status=NC_NOERR; + NC_lead_req *lead; + MPI_Aint addr; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *disps, *blens; +#else + MPI_Aint *disps; + int *blens; #endif + + if (num_reqs == 0) { + buf_view->type = MPI_BYTE; + buf_view->count = 0; + return NC_NOERR; + } + #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count bufLen; - MPI_Type_size_c(bufType, &bufLen); + disps = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); + blens = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); #else - int bufLen; - MPI_Type_size(bufType, &bufLen); + disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + blens = (int*) NCI_Malloc(sizeof(int) * num_reqs); #endif - bufLen *= bufCount; - /* First, tell aggregator how much to receive by sending: - * (num_pairs and bufLen). 
The message size to be sent by this rank - * is num_pairs * 2 * sizeof(MPI_Offset) + bufLen - */ - if (ncp->rank == ncp->my_aggr) - msg = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 2); - else - msg = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 2); + /* set disps[0] and blens[0] */ + MPI_Get_address(reqs[0].xbuf, &addr); /* displacement uses MPI_BOTTOM */ + disps[0] = addr; - msg[0] = num_pairs; - msg[1] = bufLen; + lead = (fIsSet(reqMode, NC_REQ_WR)) ? ncp->put_lead_list + : ncp->get_lead_list; - /* Aggregator collects each non-aggregator's num_pairs and bufLen */ - if (ncp->rank == ncp->my_aggr) { - req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs); - nreqs = 0; - for (i=1; inum_nonaggrs; i++) - MPI_Irecv(msg + i*2, 2, MPI_AINT, ncp->nonaggr_ranks[i], 0, - ncp->comm, &req[nreqs++]); + /* blens[] are in bytes */ + blens[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; + *buf = lead[reqs[0].lead_off].xbuf; - mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; + for (i=0, j=1; jxsz; + + /* coalesce the disps-blens pairs */ + if (disps[i] + blens[i] == disps[j]) + blens[i] += blens[j]; + else { + i++; + if (i < j) { + disps[i] = disps[j]; + blens[i] = blens[j]; + } } } - else { /* non-aggregator */ - MPI_Send(msg, 2, MPI_AINT, ncp->my_aggr, 0, ncp->comm); - if (num_pairs == 0) - NCI_Free(msg); + + if (i + 1 < num_reqs) { + num_reqs = i + 1; +#ifdef HAVE_MPI_LARGE_COUNT + disps = (MPI_Count*)NCI_Realloc(disps, sizeof(MPI_Count) * num_reqs); + blens = (MPI_Count*)NCI_Realloc(blens, sizeof(MPI_Count) * num_reqs); +#else + disps = (MPI_Aint*) NCI_Realloc(disps, sizeof(MPI_Aint) * num_reqs); + blens = (int*) NCI_Realloc(blens, sizeof(int) * num_reqs); +#endif } - /* Aggregator collects offset-length pairs from non-aggregators */ - if 
(ncp->rank == ncp->my_aggr) { - /* calculate the total number of offset-length pairs */ - npairs = num_pairs; - for (i=1; inum_nonaggrs; i++) npairs += msg[i*2]; + buf_view->count = num_reqs; + buf_view->off = disps; + buf_view->len = blens; +/* TODO: below datatype construction moves into ncmpio_read_write() */ + if (num_reqs == 1) { +#if 1 +buf_view->count = blens[0]; +#endif + buf_view->type = MPI_BYTE; + } + else { +#if 1 + /* construct buffer derived datatype */ #ifdef HAVE_MPI_LARGE_COUNT - if (npairs > num_pairs) { - /* realloc to store all pairs in a contiguous buffer */ - offsets = (MPI_Count*) NCI_Realloc(offsets, sizeof(MPI_Count) * npairs); - lengths = (MPI_Count*) NCI_Realloc(lengths, sizeof(MPI_Count) * npairs); - } + mpireturn = MPI_Type_create_hindexed_c(num_reqs, blens, disps, + MPI_BYTE, &buf_view->type); #else - if (npairs > num_pairs) { - /* realloc to store all pairs in a contiguous buffer */ - offsets = (MPI_Aint*) NCI_Realloc(offsets, sizeof(MPI_Aint) * npairs); - lengths = (int*) NCI_Realloc(lengths, sizeof(int) * npairs); + mpireturn = MPI_Type_create_hindexed(num_reqs, blens, disps, + MPI_BYTE, &buf_view->type); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) status = err; + + buf_view->type = MPI_BYTE; + buf_view->count = 0; + } + else { + MPI_Type_commit(&buf_view->type); +buf_view->count = 1; } #endif + *buf = NULL; /* buf_view->type is constructed using MPI_BOTTOM */ + } - nreqs = 0; -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Aint aint; +#if 1 + NCI_Free(blens); + NCI_Free(disps); +#endif + return status; +} +#endif + +/*----< ina_collect_md() >---------------------------------------------------*/ +/* Within each intra-node aggregation group, the aggregator collects request + * metadata from the non-aggregators into meta, including: + * 1. 
the number of offset-length pairs on each non-aggregator + * 2. offsets array of each non-aggregator + * 3. lengths array of each non-aggregator + * 4. npairs is the total number of offset-length pairs of this group. + */ +static +int ina_collect_md(NC *ncp, + MPI_Aint *meta, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count **offsets, /* OUT: may be realloc-ed */ + MPI_Count **lengths, /* OUT: may be realloc-ed */ +#else + MPI_Offset **offsets, /* OUT: may be realloc-ed */ + int **lengths, /* OUT: may be realloc-ed */ +#endif + MPI_Aint *npairs) /* OUT: total no. off-len pairs */ +{ + int i, err, mpireturn, status=NC_NOERR, nreqs; + MPI_Request *req=NULL; + MPI_Aint num_pairs=meta[0]; + + /* Aggregator collects each non-aggregator's num_pairs and bufLen */ + if (ncp->my_aggr == ncp->rank) { + + req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs); + nreqs = 0; + for (i=1; inum_nonaggrs; i++) + TRACE_COMM(MPI_Irecv)(meta + i*3, 3, MPI_AINT, + ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]); + + if (nreqs > 0) { +#ifdef HAVE_MPI_STATUSES_IGNORE + TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + TRACE_COMM(MPI_Waitall)(nreqs, req, statuses); + NCI_Free(statuses); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) status = err; + } + } + } + else /* non-aggregator */ + TRACE_COMM(MPI_Send)(meta, 3, MPI_AINT, ncp->my_aggr, 0, ncp->comm); + + /* Secondly, aggregators collect offset-length pairs from all its + * non-aggregators + */ + if (ncp->my_aggr == ncp->rank) { + MPI_Datatype recvType; + + /* calculate the total number of offset-length pairs to receive */ + for (*npairs=0, i=0; inum_nonaggrs; i++) *npairs += meta[i*3]; + + /* offsets and lengths have been allocated for storing this rank's + * offsets and lengths, realloc them 
to receive offsets and lengths + * from non-aggregators so they can be in a contiguous buffer. + */ +#ifdef HAVE_MPI_LARGE_COUNT + if (*npairs > num_pairs) { + *offsets = (MPI_Count*) NCI_Realloc(*offsets, *npairs * sizeof(MPI_Count)); + *lengths = (MPI_Count*) NCI_Realloc(*lengths, *npairs * sizeof(MPI_Count)); + } +#else + if (*npairs > num_pairs) { + /* realloc to store all pairs in a contiguous buffer */ + *offsets = (MPI_Offset*) NCI_Realloc(*offsets, *npairs * sizeof(MPI_Offset)); + *lengths = (int*) NCI_Realloc(*lengths, *npairs * sizeof(int)); + } +#endif + + /* To minimize number of MPI recv calls per non-aggregator, below + * creates a derived datatype, recvType, to combine offsets and lengths + * into one MPI_Irecv call. + */ + nreqs = 0; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Aint aint; MPI_Count bklens[2]; MPI_Count disps[2]; - MPI_Get_address(offsets, &aint); - disps[0] = MPI_Aint_add(aint, sizeof(MPI_Count) * msg[0]); - MPI_Get_address(lengths, &aint); - disps[1] = MPI_Aint_add(aint, sizeof(MPI_Count) * msg[0]); + MPI_Get_address(*offsets, &aint); + disps[0] = MPI_Aint_add(aint, sizeof(MPI_Count) * meta[0]); + MPI_Get_address(*lengths, &aint); + disps[1] = MPI_Aint_add(aint, sizeof(MPI_Count) * meta[0]); for (i=1; inum_nonaggrs; i++) { - if (msg[i*2] == 0) continue; - bklens[0] = msg[i*2] * sizeof(MPI_Count); - bklens[1] = msg[i*2] * sizeof(MPI_Count); + if (meta[i*3] == 0) continue; + bklens[0] = meta[i*3] * sizeof(MPI_Count); + bklens[1] = meta[i*3] * sizeof(MPI_Count); mpireturn = MPI_Type_create_hindexed_c(2, bklens, disps, MPI_BYTE, - &recvTypes); + &recvType); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed_c"); /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } else { - mpireturn = MPI_Type_commit(&recvTypes); + mpireturn = MPI_Type_commit(&recvType); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit"); /* return the first 
encountered error if there is any */ @@ -926,35 +1440,34 @@ intra_node_aggregation(NC *ncp, } } /* post to receive offset-length pairs from non-aggregators */ - MPI_Irecv_c(MPI_BOTTOM, 1, recvTypes, ncp->nonaggr_ranks[i], - 0, ncp->comm, &req[nreqs]); - MPI_Type_free(&recvTypes); + TRACE_COMM(MPI_Irecv_c)(MPI_BOTTOM, 1, recvType, + ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]); + MPI_Type_free(&recvType); disps[0] = MPI_Aint_add(disps[0], bklens[0]); disps[1] = MPI_Aint_add(disps[1], bklens[1]); - nreqs++; } #else int bklens[2]; MPI_Aint aint, disps[2]; - MPI_Get_address(offsets, &aint); - disps[0] = MPI_Aint_add(aint, sizeof(MPI_Aint) * msg[0]); - MPI_Get_address(lengths, &aint); - disps[1] = MPI_Aint_add(aint, sizeof(int) * msg[0]); + MPI_Get_address(*offsets, &aint); + disps[0] = MPI_Aint_add(aint, sizeof(MPI_Offset) * meta[0]); + MPI_Get_address(*lengths, &aint); + disps[1] = MPI_Aint_add(aint, sizeof(int) * meta[0]); for (i=1; inum_nonaggrs; i++) { - if (msg[i*2] == 0) continue; - bklens[0] = msg[i*2] * sizeof(MPI_Aint); - bklens[1] = msg[i*2] * sizeof(int); + if (meta[i*3] == 0) continue; + bklens[0] = meta[i*3] * sizeof(MPI_Offset); + bklens[1] = meta[i*3] * sizeof(int); mpireturn = MPI_Type_create_hindexed(2, bklens, disps, MPI_BYTE, - &recvTypes); + &recvType); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } else { - mpireturn = MPI_Type_commit(&recvTypes); + mpireturn = MPI_Type_commit(&recvType); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit"); /* return the first encountered error if there is any */ @@ -962,368 +1475,1430 @@ intra_node_aggregation(NC *ncp, } } /* post to receive offset-length pairs from non-aggregators */ - MPI_Irecv(MPI_BOTTOM, 1, recvTypes, ncp->nonaggr_ranks[i], - 0, ncp->comm, &req[nreqs]); - MPI_Type_free(&recvTypes); + 
TRACE_COMM(MPI_Irecv)(MPI_BOTTOM, 1, recvType, + ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]); + MPI_Type_free(&recvType); disps[0] = MPI_Aint_add(disps[0], bklens[0]); disps[1] = MPI_Aint_add(disps[1], bklens[1]); - nreqs++; } #endif - mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; + if (nreqs > 0) { +#ifdef HAVE_MPI_STATUSES_IGNORE + TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + TRACE_COMM(MPI_Waitall)(nreqs, req, statuses); + NCI_Free(statuses); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) status = err; + } } + NCI_Free(req); } else if (num_pairs > 0) { /* non-aggregator */ - /* send offset-length pairs data to the aggregator */ + /* To minimize number of MPI send calls to the aggregator, below + * creates a derived datatype, sendType, to combine offsets and lengths + * into one MPI_Send call. 
+ */ + MPI_Datatype sendType; + #ifdef HAVE_MPI_LARGE_COUNT MPI_Aint aint; MPI_Count bklens[2]; MPI_Count disps[2]; - bklens[0] = msg[0] * sizeof(MPI_Count); + bklens[0] = meta[0] * sizeof(MPI_Count); bklens[1] = bklens[0]; - MPI_Get_address(offsets, &aint); + MPI_Get_address(*offsets, &aint); disps[0] = aint; - MPI_Get_address(lengths, &aint); + MPI_Get_address(*lengths, &aint); disps[1] = aint; mpireturn = MPI_Type_create_hindexed_c(2, bklens, disps, MPI_BYTE, - &recvTypes); + &sendType); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed_c"); /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } else { - mpireturn = MPI_Type_commit(&recvTypes); + mpireturn = MPI_Type_commit(&sendType); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit"); /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } } - MPI_Send_c(MPI_BOTTOM, 1, recvTypes, ncp->my_aggr, 0, ncp->comm); - MPI_Type_free(&recvTypes); + TRACE_COMM(MPI_Send_c)(MPI_BOTTOM, 1, sendType, ncp->my_aggr, 0, + ncp->comm); + MPI_Type_free(&sendType); #else int bklens[2]; MPI_Aint disps[2]; - bklens[0] = msg[0] * sizeof(MPI_Aint); - bklens[1] = msg[0] * sizeof(int); - MPI_Get_address(offsets, &disps[0]); - MPI_Get_address(lengths, &disps[1]); + bklens[0] = meta[0] * sizeof(MPI_Aint); + bklens[1] = meta[0] * sizeof(int); + MPI_Get_address(*offsets, &disps[0]); + MPI_Get_address(*lengths, &disps[1]); mpireturn = MPI_Type_create_hindexed(2, bklens, disps, MPI_BYTE, - &recvTypes); + &sendType); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } else { - mpireturn = MPI_Type_commit(&recvTypes); + mpireturn = MPI_Type_commit(&sendType); if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit"); 
/* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } } - MPI_Send(MPI_BOTTOM, 1, recvTypes, ncp->my_aggr, 0, ncp->comm); - MPI_Type_free(&recvTypes); + TRACE_COMM(MPI_Send)(MPI_BOTTOM, 1, sendType, ncp->my_aggr, 0, + ncp->comm); + MPI_Type_free(&sendType); #endif - NCI_Free(msg); } - /* - * TODO, define a datatype to combine sends of offset-length pairs with the - * write data into a single send call. - */ - nreqs = 0; - if (ncp->rank == ncp->my_aggr) { - /* calculate the total write account */ - buf_count = bufLen; - for (i=1; inum_nonaggrs; i++) buf_count += msg[i*2 + 1]; - - /* Allocate receive buffer, which will be sorted into an increasing - * order based on the file offsets. Thus, after sorting pack recv_buf - * to wr_buf to avoid creating another buffer datatype. - */ - if (buf_count > 0) { - recv_buf = (char*) NCI_Malloc(buf_count); - wr_buf = (char*) NCI_Malloc(buf_count); - } + return status; +} - /* First, pack self write data into front of the recv_buf */ - if (bufLen > 0) { - if (bufType == MPI_BYTE) - memcpy(recv_buf, buf, bufLen); - else { - void *inbuf = (buf == NULL) ? MPI_BOTTOM : buf; +/*----< ina_put() >----------------------------------------------------------*/ +/* This subroutine implements the intra-node aggregation for write operations. + */ +static +int ina_put(NC *ncp, + int is_incr, /* if offsets are incremental */ + MPI_Aint num_pairs, /* number of offset-length pairs */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count position=0; - MPI_Count incount = (buf == NULL) ? 1 : bufCount; - MPI_Pack_c(inbuf, incount, bufType, recv_buf, bufLen, &position, - MPI_COMM_SELF); + MPI_Count *offsets, + MPI_Count *lengths, #else - int position=0; - int incount = (buf == NULL) ? 
1 : bufCount; - MPI_Pack(inbuf, incount, bufType, recv_buf, bufLen, &position, - MPI_COMM_SELF); + MPI_Offset *offsets, + int *lengths, #endif - } + PNCIO_View buf_view, + void *buf) /* user buffer */ +{ + int i, j, err, mpireturn, status=NC_NOERR, free_buf_view_off=0; + char *recv_buf=NULL, *wr_buf = NULL; + MPI_Aint npairs=0, *meta=NULL, *count=NULL, *bufAddr=NULL; + MPI_Offset wr_amnt=0; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *off_ptr, *len_ptr; +#else + MPI_Offset *off_ptr; + int *len_ptr; +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double endT, startT = MPI_Wtime(); + MPI_Offset mem_max; + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_put[0] = MAX(ncp->maxmem_put[0], mem_max); +#endif + + /* buf may be noncontiguous ! */ + + /* Firstly, aggregators collect metadata from non-aggregators. + * + * This rank tells its aggregator how much metadata to receive from this + * rank, by sending: the number of offset-length pairs (num_pairs) and user + * buffer size in bytes (buf_view.size). This message size to be sent by + * this rank is 3 MPI_Offset. + */ + if (ncp->rank == ncp->my_aggr) + meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 3); + else + meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 3); + + meta[0] = num_pairs; + meta[1] = buf_view.size; + meta[2] = is_incr; + + /* Each aggregator first collects metadata about its offset-length pairs, + * size of write request, and whether the offsets are in an incremental + * order. The aggregator will gather these metadata from non-aggregators + * assigned to it. + * For write operation, keeping the original offset-length pairs is not + * necessary, as they will later be sorted and coalesced before calling + * MPI-IO or PNCIO file write. + * + * Once ina_collect_md() returns, this aggregator's offsets and lengths may + * grow to include the ones from non-aggregators (appended). 
+ */ + if (ncp->num_nonaggrs > 1) { + err = ina_collect_md(ncp, meta, &offsets, &lengths, &npairs); + if (err != NC_NOERR) { + NCI_Free(meta); + return err; } + } + else + npairs = num_pairs; - /* post requests to receive write data from non-aggregators */ - if (buf_count > 0) { - char *ptr = recv_buf + bufLen; - for (i=1; inum_nonaggrs; i++) { - if (msg[i*2 + 1] == 0) continue; + /* For write operation, the non-aggregators now can start sending their + * write data to the aggregator. + */ + if (ncp->rank != ncp->my_aggr) { /* non-aggregator */ + if (meta[0] > 0) { + /* Non-aggregators send write data to the aggregator */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Irecv_c(ptr, msg[i*2 + 1], MPI_BYTE, ncp->nonaggr_ranks[i], - 0, ncp->comm, &req[nreqs++]); + MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1; + TRACE_COMM(MPI_Send_c)(buf, num, buf_view.type, ncp->my_aggr, + 0, ncp->comm); #else - MPI_Irecv(ptr, msg[i*2 + 1], MPI_BYTE, ncp->nonaggr_ranks[i], - 0, ncp->comm, &req[nreqs++]); + int num = (buf_view.is_contig) ? buf_view.size : 1; + TRACE_COMM(MPI_Send)(buf, num, buf_view.type, ncp->my_aggr, + 0, ncp->comm); #endif - ptr += msg[i*2 + 1]; - } - mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; - } } - NCI_Free(req); - NCI_Free(msg); + + /* Must free offsets and lengths now, as they may be realloc-ed in + * ina_collect_md() + */ + if (offsets != NULL) NCI_Free(offsets); + if (lengths != NULL) NCI_Free(lengths); + + /* Non-aggregators are done here, as only aggregators call MPI-IO/PNCIO + * functions to write data to the file. Non-aggregators do not + * participate MPI-IO calls. + */ + NCI_Free(meta); + return status; } - else if (bufLen > 0) { - /* send write data to the aggregator */ - void *buf_ptr = (buf == NULL) ? 
MPI_BOTTOM : buf; + + /* The remaining of this subroutine is for aggregators only */ + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_put[1] = MAX(ncp->maxmem_put[1], mem_max); + endT = MPI_Wtime(); + if (ncp->rank == ncp->my_aggr) ncp->ina_time_put[0] += endT - startT; + startT = endT; +#endif + + off_ptr = offsets; + len_ptr = lengths; + + /* MPI-IO has the following requirements about filetype. + * 1. The (flattened) displacements (of a filetype) are not required to be + * distinct, but they cannot be negative, and they must be monotonically + * non-decreasing. + * 2. If the file is opened for writing, neither the etype nor the filetype + * is permitted to contain overlapping regions. + */ + if (npairs > 0) { + /* Now this aggregator has received all offset-length pairs from its + * non-aggregators. At first, check if a sorting is necessary. + */ + char *ptr; + int nreqs, indv_sorted, do_sort, overlap; + MPI_Request *req=NULL; + MPI_Offset recv_amnt; + + /* check if offsets of all non-aggregators are individual sorted */ + indv_sorted = 1; + do_sort = 0; + for (i=-1,j=0; jnum_nonaggrs; j++) { + if (i == -1 && meta[j*3] > 0) /* find 1st whose num_pairs > 0 */ + i = j; + if (meta[j*3+2] == 0) { /* j's offsets are not sorted */ + indv_sorted = 0; + do_sort = 1; + break; + } + } + /* i is the first non-aggregator whose num_pairs > 0, and + * j is the first non-aggregator whose is_incr is false + */ +// printf("%s at %d: do_sort=%d indv_sorted=%d\n",__func__,__LINE__, do_sort,indv_sorted); + + if (i >= 0 && indv_sorted == 1) { + /* When all ranks' offsets are individually sorted, we still need + * to check if offsets are interleaved among all non-aggregators to + * determine whether a sort for all offset-length pairs is + * necessary. + */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count num = (buf == NULL) ? 
1 : bufCount; - MPI_Send_c(buf_ptr, num, bufType, ncp->my_aggr, 0, ncp->comm); + MPI_Count prev_end_off; #else - int num = (buf == NULL) ? 1 : bufCount; - MPI_Send(buf_ptr, num, bufType, ncp->my_aggr, 0, ncp->comm); + MPI_Offset prev_end_off; #endif - NCI_Free(offsets); - NCI_Free(lengths); - } + assert(meta[i*3+2] == 1); + + MPI_Aint sum = meta[i*3]; + prev_end_off = off_ptr[sum-1]; /* last offset of non-aggregator i */ - /* aggregator sorts the offset-length pairs, along with the buffer */ - if (ncp->rank == ncp->my_aggr && npairs > 0) { + /* check if the offsets are interleaved */ + for (++i; inum_nonaggrs; i++) { + if (meta[i*3] == 0) /* zero-sized request */ + continue; + assert(meta[i*3+2] == 1); + + if (prev_end_off > off_ptr[sum]) { + /* off_ptr[sum] is the non-aggregator i' 1st offset */ + do_sort = 1; /* offsets are not incrementing */ + break; + } + /* move on to next non-aggregator */ + sum += meta[i*3]; + prev_end_off = off_ptr[sum-1]; + } + } - /* construct array of buffer addresses */ - MPI_Aint *bufAddr = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * npairs); + if (do_sort && indv_sorted) { + /* Interleaved offsets are found but individual offsets are already + * sorted. This is commonly seen from the checkerboard domain + * partitioning pattern. In this case, heap_merge() must be called + * to merge all individually already-sorted offsets into one single + * sorted offset list. Note count[] is initialized and will be used + * in heap_merge() + */ + count = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint)*ncp->num_nonaggrs); + for (i=0; inum_nonaggrs; i++) count[i] = meta[i*3]; + } + + /* Construct an array of buffer addresses containing a mapping of the + * buffer used to receive write data from non-aggregators and the + * buffer used to write to file. bufAddr[] is calculated based on the + * assumption that the write buffer of this aggregator is contiguous, + * i.e. buf_view.is_contig being 1. 
For non-aggregators, their write + * data will always be received into a contiguous buffer. + */ + bufAddr = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * npairs); bufAddr[0] = 0; for (i=1; inum_nonaggrs, count, npairs, off_ptr, len_ptr, + bufAddr); + NCI_Free(count); + } + else + /* When some individual offsets are not sorted, we cannot use + * heap_merge(). Note qsort() is an in-place sorting. + */ + qsort_off_len_buf(npairs, off_ptr, len_ptr, bufAddr); + } +// printf("%s at %d: do_sort=%d indv_sorted=%d\n",__func__,__LINE__, do_sort,indv_sorted); - /* merge the overlapped buffer segments, skip the overlapped regions - * for those with higher j indices (i.e. requests with lower j indices - * win the writes to the overlapped regions) + /* Now off_ptr and len_ptr are sorted, but overlaps may exist between + * adjacent pairs. If this is the case, they must be coalesced. + * + * Below loop checks if there is overlap and calculates recv_amnt and + * wr_amnt. + * recv_amnt is the total amount this aggregator will receive from + * non-aggregators, including self. recv_amnt includes overlaps. + * wr_amnt is recv_amnt with overlap removed. + * + * This loop also coalesces offset-length pairs as well as the + * corresponding buffer addresses, so they can be used to move write + * data around in the true write buffer. 
*/ + overlap = 0; +int fake_overlap=0; + wr_amnt = recv_amnt = len_ptr[0]; for (i=0, j=1; j= offsets[j] + lengths[j]) + recv_amnt += len_ptr[j]; + if (off_ptr[i] + len_ptr[i] >= off_ptr[j] + len_ptr[j]) { + overlap = 1; +fake_overlap=1; /* segment i completely covers segment j, skip j */ continue; + } - MPI_Offset gap = offsets[i] + lengths[i] - offsets[j]; - if (gap >= 0) { /* segments i and j overlaps */ - if (bufAddr[i] + lengths[i] == bufAddr[j] + gap) { - /* buffers i and j are contiguous, merge j to i */ - lengths[i] += lengths[j] - gap; + MPI_Offset gap = off_ptr[i] + len_ptr[i] - off_ptr[j]; + if (gap >= 0) { /* overlap detected, merge j into i */ + /* when gap > 0, pairs i and j overlap + * when gap == 0, pairs i and j are contiguous + */ + if (gap > 0) overlap = 1; +if (gap >= 0) fake_overlap=1; + wr_amnt += len_ptr[j] - gap; + if (bufAddr[i] + len_ptr[i] == bufAddr[j] + gap) { + /* buffers i and j are contiguous, merge j into i */ + len_ptr[i] += len_ptr[j] - gap; } else { /* buffers are not contiguous, reduce j's len */ - offsets[i+1] = offsets[j] + gap; - lengths[i+1] = lengths[j] - gap; + off_ptr[i+1] = off_ptr[j] + gap; + len_ptr[i+1] = len_ptr[j] - gap; bufAddr[i+1] = bufAddr[j] + gap; i++; } } else { /* i and j do not overlap */ + wr_amnt += len_ptr[j]; i++; if (i < j) { - offsets[i] = offsets[j]; - lengths[i] = lengths[j]; + off_ptr[i] = off_ptr[j]; + len_ptr[i] = len_ptr[j]; bufAddr[i] = bufAddr[j]; } } } - /* update number of pairs, now all off-len pairs are not overlapped */ +/* +if (ncp->num_nonaggrs == 1 && do_sort == 1) printf("%s at %d: overlap=%d do_sort=%d after coalesce npairs changed from %ld to %d wr_amnt=%lld recv_amnt=%lld\n",__func__,__LINE__, overlap, do_sort,npairs,i+1,wr_amnt,recv_amnt); +*/ + +if (fake_overlap == 0) assert(npairs == i+1); + + /* Now off_ptr[], len_ptr[], bufAddr[] are coalesced and no overlap */ npairs = i+1; - /* pack recv_buf, data received from non-aggregators, into wr_buf, a - * contiguous buffer, wr_buf, 
which will later be used in a call to - * MPI_File_write_at_all() +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_put[2] = MAX(ncp->maxmem_put[2], mem_max); + + endT = MPI_Wtime(); + ncp->ina_time_put[1] += endT - startT; + ncp->ina_npairs_put = MAX(ncp->ina_npairs_put, npairs); + startT = endT; +#endif + + /* Allocate receive buffer. Once write data from non-aggregators have + * received into recv_buf, it is packed into wr_buf. Then, wr_buf is + * used to call MPI-IO/PNCIO file write. Note the wr_buf is always + * contiguous. + * + * When ncp->num_nonaggrs == 1, wr_buf is set to buf which is directly + * passed to MPI-IO/PNCIO file write. + * + * If file offset-length pairs have not been re-ordered, i.e. sorted + * and overlaps removed, and this aggregator will not receive any write + * data from its non-aggregators, then we can use user's buffer, buf, + * to call MPI-IO/PNCIO to write to the file, without allocating an + * additional temporary buffer. */ - char *ptr = wr_buf; - buf_count = 0; - if (npairs > 0) { - memcpy(ptr, recv_buf + bufAddr[0], lengths[0]); - ptr += lengths[0]; - buf_count = lengths[0]; - } - for (i=0, j=1; jmaxmem_put[3] = MAX(ncp->maxmem_put[3], mem_max); +#endif + + if (recv_buf != buf) { + /* Pack this aggregator's write data into front of recv_buf */ + if (buf_view.is_contig && buf_view.type == MPI_BYTE) + memcpy(recv_buf, buf, buf_view.size); else { - i++; - if (i < j) { - offsets[i] = offsets[j]; - lengths[i] = lengths[j]; - } +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count pos=0; + MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1; + MPI_Pack_c(buf, num, buf_view.type, recv_buf, buf_view.size, + &pos, MPI_COMM_SELF); +#else + int pos=0; + MPI_Count num = (buf_view.is_contig) ? 
buf_view.size : 1; + MPI_Pack(buf, num, buf_view.type, recv_buf, buf_view.size, + &pos, MPI_COMM_SELF); +#endif } } - NCI_Free(bufAddr); - if (recv_buf != NULL) NCI_Free(recv_buf); - /* update number of pairs, now all off-len pairs are not overlapped */ - npairs = i+1; +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + endT = MPI_Wtime(); + ncp->ina_time_put[2] += endT - startT; + startT = endT; +#endif - if (npairs == 1) { - /* No need to create fileType if writing to a contiguous space */ - offset = offsets[0]; - } - else { + /* Receive write data sent from non-aggregators. Note we cannot move + * the posting of MPI_Irecv calls to before sorting and leave + * MPI_Waitall() to after sorting to overlap communication with the + * sorting, because the sorting determines the receive buffer size. + */ + req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs); + ptr = recv_buf + buf_view.size; + nreqs = 0; + for (i=1; inum_nonaggrs; i++) { + if (meta[i*3 + 1] == 0) continue; #ifdef HAVE_MPI_LARGE_COUNT - /* construct fileview */ - mpireturn = MPI_Type_create_hindexed_c(npairs, lengths, offsets, - MPI_BYTE, &fileType); - + TRACE_COMM(MPI_Irecv_c)(ptr, meta[i*3 + 1], MPI_BYTE, + ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]); #else - /* construct fileview */ - mpireturn = MPI_Type_create_hindexed(npairs, lengths, offsets, - MPI_BYTE, &fileType); + TRACE_COMM(MPI_Irecv)(ptr, meta[i*3 + 1], MPI_BYTE, + ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]); +#endif + ptr += meta[i*3 + 1]; + } + if (nreqs > 0) { +#ifdef HAVE_MPI_STATUSES_IGNORE + TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + TRACE_COMM(MPI_Waitall)(nreqs, req, statuses); + NCI_Free(statuses); #endif if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); /* return the first 
encountered error if there is any */ if (status == NC_NOERR) status = err; } - else { - mpireturn = MPI_Type_commit(&fileType); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; - } + } + NCI_Free(req); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + endT = MPI_Wtime(); + if (ncp->rank == ncp->my_aggr) ncp->ina_time_put[3] += endT - startT; + startT = endT; +#endif + + /* Now all write data has been collected into recv_buf. In case of any + * overlap, we must coalesce recv_buf into wr_buf using off_ptr[], + * len_ptr[], and bufAddr[]. For overlapped regions, requests with + * lower j indices win the writes to the overlapped regions. + * + * In case the user buffer, buf, can not be used to write to the file, + * loop below packs recv_buf, data received from non-aggregators, into + * wr_buf, a contiguous buffer, wr_buf, which will later be used in a + * call to MPI-IO/PNCIO file write. + */ + if (!do_sort && wr_amnt == recv_amnt) { + wr_buf = recv_buf; + + if (wr_buf != buf) { + /* If write data has been packed in wr_buf, a contiguous buffer, + * update buf_view before passing it to the MPI-IO/PNCIO file + * write. + */ + buf_view.size = wr_amnt; + buf_view.type = MPI_BYTE; + buf_view.is_contig = 1; } + /* else case is when user's buffer, buf, can be used to write */ + } + else if (buf_view.is_contig && !overlap) { + /* Note we can reuse bufAddr[] and len_ptr[] as buf_view.off and + * buf_view.len only when buf_view.is_contig is true, because + * bufAddr[] is constructed based on the assumption that the write + * buffer is contiguous. 
+ */ + wr_buf = recv_buf; + buf_view.size = wr_amnt; + buf_view.type = MPI_BYTE; + buf_view.is_contig = (npairs <= 1); + buf_view.len = len_ptr; + buf_view.count = npairs; +#if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + buf_view.off = (MPI_Offset*)bufAddr; /* based on recv_buf */ +#else + buf_view.off = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * npairs); + for (j=0; j 0) */ + + NCI_Free(meta); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + endT = MPI_Wtime(); + if (ncp->rank == ncp->my_aggr) ncp->ina_time_put[4] += endT - startT; +#endif + + /* set the fileview */ + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, npairs, off_ptr, len_ptr); + if (err != NC_NOERR) { + if (status == NC_NOERR) status = err; + wr_amnt = 0; + } + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_put[4] = MAX(ncp->maxmem_put[4], mem_max); +#endif + + /* carry out write request to file */ + err = ncmpio_read_write(ncp, NC_REQ_WR, 0, buf_view, wr_buf); + if (status == NC_NOERR) status = err; + + if (free_buf_view_off) NCI_Free(buf_view.off); + if (wr_buf != buf) NCI_Free(wr_buf); + if (bufAddr != NULL) NCI_Free(bufAddr); + + /* Must free offsets and lengths now, as they may be realloc-ed in + * ina_collect_md() + */ + if (offsets != NULL) NCI_Free(offsets); + if (lengths != NULL) NCI_Free(lengths); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_put[5] = MAX(ncp->maxmem_put[5], mem_max); +#endif + + return status; +} + +static +size_t bin_search( +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count key, MPI_Count *base, +#else + MPI_Offset key, MPI_Offset *base, +#endif + size_t nmemb) +{ + size_t low, high; + + /* only one element */ + if (nmemb == 1) + return (base[0] <= key) ? 
0 : -1; + + /* check the 1st element */ + if (base[0] <= key && key < base[1]) + return 0; + + low = 1; + high = nmemb - 1; + + while (low <= high) { + size_t mid = low + (high - low) / 2; + if (base[mid] == key) + return mid; + if (base[mid] < key) + low = mid + 1; + else + high = mid - 1; + } + return (low - 1); +} + +/*----< ina_get() >----------------------------------------------------------*/ +/* This subroutine implements the intra-node aggregation for read operations. + */ +static +int ina_get(NC *ncp, + int is_incr, /* if offsets are incremental */ + MPI_Aint num_pairs, /* number of offset-length pairs */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *offsets, + MPI_Count *lengths, +#else + MPI_Offset *offsets, + int *lengths, +#endif + PNCIO_View buf_view, + void *buf) /* user buffer */ +{ + int i, j, err, mpireturn, status=NC_NOERR, nreqs; + int do_sort=0, indv_sorted=1, overlap=0; + char *rd_buf = NULL; + MPI_Aint npairs=0, max_npairs, *meta=NULL, *count=NULL; + MPI_Offset send_amnt=0, rd_amnt=0, off_start; + MPI_Request *req=NULL; + PNCIO_View rd_buf_view; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *off_ptr, *len_ptr, *orig_off_ptr, *orig_len_ptr; + MPI_Count bufLen, *orig_offsets=NULL, *orig_lengths=NULL; + MPI_Count *blks = NULL, *disps = NULL; +#else + MPI_Offset *orig_offsets=NULL, *orig_off_ptr, *off_ptr; + int bufLen, *orig_lengths=NULL, *orig_len_ptr, *len_ptr, *blks = NULL; + MPI_Aint *disps = NULL; +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double endT, startT = MPI_Wtime(); + MPI_Offset mem_max; + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_get[0] = MAX(ncp->maxmem_get[0], mem_max); +#endif + + bufLen = buf_view.size; + + /* Firstly, aggregators collect metadata from non-aggregators. + * + * This rank tells its aggregator how much metadata to receive from this + * rank, by sending + * 1. the number of offset-length pairs (num_pairs) + * 2. user buffer size in bytes (bufLen). 
+ * 3. whether this rank's offsets are sorted in increasing order. + * This message size to be sent by this rank is 3 MPI_Offset. + */ + if (ncp->rank == ncp->my_aggr) + meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 3); + else + meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 3); + + meta[0] = num_pairs; + meta[1] = bufLen; + meta[2] = is_incr; + + /* Each aggregator first collects metadata about its offset-length pairs, + * size of read request, and whether the offsets are in an incremental + * order. The aggregator will gather these metadata from non-aggregators + * assigned to it. + * + * Once ina_collect_md() returns, this aggregator's offsets and lengths may + * grow to include the ones from non-aggregators (appended). + */ + if (ncp->num_nonaggrs > 1) { + err = ina_collect_md(ncp, meta, &offsets, &lengths, &npairs); + if (err != NC_NOERR) { + NCI_Free(meta); + return err; } - NCI_Free(offsets); - NCI_Free(lengths); } + else + npairs = num_pairs; + + if (ncp->rank != ncp->my_aggr) { + if (meta[0] > 0) { + /* For read operation, the non-aggregators now can start receiving + * their read data from the aggregator. + */ + MPI_Status st; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1; + TRACE_COMM(MPI_Recv_c)(buf, num, buf_view.type, ncp->my_aggr, 0, + ncp->comm, &st); +#else + int num = (buf_view.is_contig) ? buf_view.size : 1; + TRACE_COMM(MPI_Recv)(buf, num, buf_view.type, ncp->my_aggr, 0, + ncp->comm, &st); +#endif + } + + /* Must free offsets and lengths now, as they may be realloc-ed in + * ina_collect_md() + */ + if (offsets != NULL) NCI_Free(offsets); + if (lengths != NULL) NCI_Free(lengths); + + /* Non-aggregators are now done, as they do not participate MPI-IO or + * PNCIO file read. + */ + NCI_Free(meta); + return status; + } + + /* The remaining of this subroutine is for aggregators only. 
*/ + + /* For read operation, the original offsets and lengths must be kept + * untouched, because the later sorting and coalescing will mess up the + * original order of offsets and lengths, which are needed to construct a + * datatype when an aggregator sends read data to its non-aggregators. + */ +#ifdef HAVE_MPI_LARGE_COUNT + orig_offsets = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * npairs); + orig_lengths = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * npairs); + memcpy(orig_offsets, offsets, sizeof(MPI_Count) * npairs); + memcpy(orig_lengths, lengths, sizeof(MPI_Count) * npairs); +#else + orig_offsets = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * npairs); + orig_lengths = (int*) NCI_Malloc(sizeof(int) * npairs); + memcpy(orig_offsets, offsets, sizeof(MPI_Offset) * npairs); + memcpy(orig_lengths, lengths, sizeof(int) * npairs); +#endif + orig_off_ptr = orig_offsets; + orig_len_ptr = orig_lengths; + off_ptr = offsets; + len_ptr = lengths; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_get[1] = MAX(ncp->maxmem_get[1], mem_max); +#endif + + /* MPI-IO has the following requirements about filetype. + * 1. The (flattened) displacements (of a filetype) are not required to be + * distinct, but they cannot be negative, and they must be monotonically + * non-decreasing. + * 2. If the file is opened for writing, neither the etype nor the filetype + * is permitted to contain overlapping regions. + */ + if (npairs > 0) { + /* Now this aggregator has received all offset-length pairs from its + * non-aggregators. At first, check if a sorting is necessary. 
+ */ + + /* check if offsets of all non-aggregators are individual sorted */ + indv_sorted = 1; + for (i=-1,j=0; jnum_nonaggrs; j++) { + if (i == -1 && meta[j*3] > 0) /* find 1st whose num_pairs > 0 */ + i = j; + if (meta[j*3+2] == 0) { /* j's offsets are not sorted */ + indv_sorted = 0; + do_sort = 1; + break; + } + } + /* i is the first non-aggregator whose num_pairs > 0 + * j is the first non-aggregator whose is_incr is false + */ + + if (i >= 0 && indv_sorted == 1) { + /* When all ranks' offsets are individually sorted, we still need + * to check if offsets are interleaved among all non-aggregators to + * determine whether a sort for all offset-length pairs is + * necessary. + */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count prev_end_off; +#else + MPI_Offset prev_end_off; +#endif + assert(meta[i*3+2] == 1); + + MPI_Aint sum = meta[i*3]; + prev_end_off = off_ptr[sum-1]; /* last offset of non-aggregator i */ + + /* check if the offsets are interleaved */ + for (++i; inum_nonaggrs; i++) { + if (meta[i*3] == 0) /* zero-sized request */ + continue; + assert(meta[i*3+2] == 1); + if (prev_end_off > off_ptr[sum]) { + /* off_ptr[sum] is the non-aggregator i' 1st offset */ + do_sort = 1; /* offsets are not incrementing */ + break; + } + /* move on to next non-aggregator */ + sum += meta[i*3]; + prev_end_off = off_ptr[sum-1]; + } + } + + if (do_sort && indv_sorted) { + /* Interleaved offsets are found but individual offsets are already + * sorted. In this case, heap_merge() is called to merge all + * offsets into one single sorted offset list. Note count[] is + * initialized and will be used in heap_merge() + */ + count = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint)* ncp->num_nonaggrs); + for (i=0; inum_nonaggrs; i++) count[i] = meta[i*3]; + } + + /* Construct an array of buffer addresses containing a mapping of the + * buffer used to receive write data from non-aggregators and the + * buffer used to write to file. 
+ */ + if (do_sort) { + /* Sort offsets and lengths, based on offsets into an increasing + * order. + */ + if (indv_sorted) { + /* heap-merge() runs much faster than qsort() when individual + * lists have already been sorted. However, it has a much + * bigger memory footprint. + */ + heap_merge(ncp->num_nonaggrs, count, npairs, off_ptr, len_ptr, + NULL); + NCI_Free(count); + } + else + /* When some individual offsets are not sorted, we cannot use + * heap_merge(). Note qsort() is an in-place sorting. + */ + qsort_off_len_buf(npairs, off_ptr, len_ptr, NULL); + } #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - ncp->aggr_time += MPI_Wtime() - timing; + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_get[2] = MAX(ncp->maxmem_get[2], mem_max); + ncp->ina_npairs_get = MAX(ncp->ina_npairs_get, npairs); #endif - if (ncp->rank != ncp->my_aggr) /* non-aggregator writes nothing */ - buf_count = 0; + /* Coalesce the offset-length pairs and calculate the total read amount + * and send amount by this aggregator. + */ + overlap = 0; + send_amnt = rd_amnt = len_ptr[0]; + for (i=0, j=1; j= 0) { /* overlap detected, merge j into i */ + /* when gap > 0, pairs i and j overlap + * when gap == 0, pairs i and j are contiguous + */ + MPI_Offset i_end, j_end; + + if (gap > 0) overlap = 1; + + i_end = off_ptr[i] + len_ptr[i]; + j_end = off_ptr[j] + len_ptr[j]; + if (i_end < j_end) { + len_ptr[i] += j_end - i_end; + rd_amnt += j_end - i_end; + } + /* else: j is entirely covered by i */ + } + else { /* j and i are not overlapped */ + rd_amnt += len_ptr[j]; + i++; + if (i < j) { + off_ptr[i] = off_ptr[j]; + len_ptr[i] = len_ptr[j]; + } + } + } + + /* update npairs after coalesce */ + npairs = i+1; - /* Only aggregators writes non-zero sized of data to the file. The - * non-aggregators participate the collective write call with zero-length - * write requests. 
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_get[3] = MAX(ncp->maxmem_get[3], mem_max); +#endif + } /* if (npairs > 0) */ + /* else case: This aggregation group may not have data to read, but must + * participate the collective MPI-IO calls. */ - fh = ncp->collective_fh; - /* set the MPI-IO fileview, this is a collective call */ - err = ncmpio_file_set_view(ncp, fh, &offset, fileType); - if (fileType != MPI_BYTE) MPI_Type_free(&fileType); + /* set the fileview */ + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, npairs, off_ptr, len_ptr); if (err != NC_NOERR) { if (status == NC_NOERR) status = err; - buf_count = 0; + rd_amnt = 0; + } + + /* Allocate read buffer and send buffer. Once data are read from file into + * rd_buf, it is unpacked into send_buf for each non-aggregator. send_buf + * will be directly used to send the read request data to non-aggregators. + * + * Note rd_amnt may not be the same as send_amnt, as there can be overlaps + * between adjacent offset-length pairs after sorted. + * + * If file offset-length pairs have not been re-ordered, i.e. sorted and + * overlaps removed, and this aggregator will not send any read data to its + * non-aggregators, then we can use user's buffer, buf, to call + * MPI-IO/PNCIO to read from the file, without allocating an additional + * temporary buffer. + */ + if (!do_sort && buf_view.size == send_amnt && !overlap) { + rd_buf_view = buf_view; + rd_buf = buf; + } + else { + /* Read data will be stored in a contiguous read buffer. 
*/ + rd_buf_view.size = rd_amnt; + rd_buf_view.type = MPI_BYTE; + rd_buf_view.is_contig = 1; + if (rd_amnt > 0) + rd_buf = (char*) NCI_Malloc(rd_amnt); } - /* call MPI_File_write_at_all */ - err = ncmpio_read_write(ncp, NC_REQ_WR, NC_REQ_COLL, offset, buf_count, - MPI_BYTE, wr_buf, 1); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_get[4] = MAX(ncp->maxmem_get[4], mem_max); + endT = MPI_Wtime(); + ncp->ina_time_get[0] += endT - startT; +#endif + + err = ncmpio_read_write(ncp, NC_REQ_RD, 0, rd_buf_view, rd_buf); if (status == NC_NOERR) status = err; - if (wr_buf != NULL) NCI_Free(wr_buf); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + // ncmpi_inq_malloc_size(&mem_max); + ncmpi_inq_malloc_max_size(&mem_max); + ncp->maxmem_get[5] = MAX(ncp->maxmem_get[5], mem_max); + startT = MPI_Wtime(); +#endif + + /* If sorting has been performed, the orders of off_ptr[] and len_ptr[] may + * no longer be the same as the original ones. We must use binary search to + * find the aggregated offset-length pair containing each non-aggregator's + * offset-length pair to construct a send buffer datatype, a view layout to + * the read buffer, rd_buf, so the data can be directly sent from rd_buf. + */ + if (rd_buf != buf) { + /* First, aggregators copy the read data to their own user buffer. + * Note off_ptr[] is sorted in an incremental order. + * + * When the offset-length pairs of read buffer have been sorted or + * the read buffer size is smaller than the total get amount, we must + * search and copy from read buffer to self's user buffer. + */ + char *ptr=NULL, *tmp_buf=NULL; + size_t m=0, k, scan_off=0; + + /* If this aggregator's user buftype is contiguous, the reuse its + * read buffer. If not, allocate a temporary buffer, copy the read + * data over, and then unpacking it to the user buffer. 
+ */ + if (buf_view.is_contig) + ptr = buf; + else if (bufLen > 0) + ptr = tmp_buf = (char*) NCI_Malloc(bufLen); + + for (j=0; j 0 && !buf_view.is_contig) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count pos=0; + MPI_Unpack_c(tmp_buf, bufLen, &pos, buf, 1, buf_view.type, + MPI_COMM_SELF); +#else + int pos=0; + MPI_Unpack(tmp_buf, bufLen, &pos, buf, 1, buf_view.type, + MPI_COMM_SELF); +#endif + NCI_Free(tmp_buf); + } + } + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + endT = MPI_Wtime(); + ncp->ina_time_get[1] += endT - startT; + startT = endT; +#endif + + if (ncp->num_nonaggrs == 1) + /* In this case, communication will not be necessary. */ + goto fn_exit; + + /* Aggregators start sending read data to non-aggregators. At first, + * allocate array_of_blocklengths[] and array_of_displacements[] + */ + for (max_npairs=0, i=1; inum_nonaggrs; i++) + max_npairs = MAX(meta[3*i], max_npairs); + +#ifdef HAVE_MPI_LARGE_COUNT + blks = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * max_npairs); + disps = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * max_npairs); +#else + blks = (int*) NCI_Malloc(sizeof(int) * max_npairs); + disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * max_npairs); +#endif + + /* Now, send data to each non-aggregator */ + req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs); + nreqs = 0; + off_start = meta[0]; + for (i=1; inum_nonaggrs; i++) { + /* populate disps[] and blks[] */ + MPI_Aint remote_num_pairs = meta[3*i]; + MPI_Aint remote_is_incr = meta[3*i+2]; + + if (remote_num_pairs == 0) continue; /* zero sized request */ + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *off = orig_off_ptr + off_start; + MPI_Count *len = orig_len_ptr + off_start; +#else + MPI_Offset *off = orig_off_ptr + off_start; + int *len = orig_len_ptr + off_start; +#endif + size_t k, m = 0; + size_t scan_off = 0; + for (j=0; jnonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]); +#else + TRACE_COMM(MPI_Isend)(MPI_BOTTOM, 1, sendType, + ncp->nonaggr_ranks[i], 0, 
ncp->comm, &req[nreqs++]); +#endif + MPI_Type_free(&sendType); + } + } +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + endT = MPI_Wtime(); + ncp->ina_time_get[2] += endT - startT; + startT = endT; +#endif + + if (nreqs > 0) { +#ifdef HAVE_MPI_STATUSES_IGNORE + TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + TRACE_COMM(MPI_Waitall)(nreqs, req, statuses); + NCI_Free(statuses); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) status = err; + } + } + NCI_Free(blks); + NCI_Free(disps); + +fn_exit: + /* offsets[] and lengths[] are used in PNCIO read subroutines as flattened + * filetype. They cannot be freed before the I/O is done. + */ + if (rd_buf != NULL && rd_buf != buf) NCI_Free(rd_buf); + if (orig_lengths != NULL) NCI_Free(orig_lengths); + if (orig_offsets != NULL) NCI_Free(orig_offsets); + if (req != NULL) NCI_Free(req); + if (meta != NULL) NCI_Free(meta); + + /* Must free offsets and lengths now, as they may be realloc-ed in + * ina_collect_md() + */ + if (offsets != NULL) NCI_Free(offsets); + if (lengths != NULL) NCI_Free(lengths); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + endT = MPI_Wtime(); + ncp->ina_time_get[3] += endT - startT; +#endif return status; } -/*----< ncmpio_intra_node_aggregation_nreqs() >------------------------------*/ -/* This is a collective call */ +/*----< req_compare() >------------------------------------------------------*/ +/* qsort() comparison callback used to sort reqs[] into an increasing order of + * the starting file offsets of the requests. Arguments a and b point to two + * NC_req elements. Returns 1 when the offset_start of a is larger than that + * of b, -1 when it is smaller, and 0 when the two are equal. + */ +static int +req_compare(const void *a, const void *b) +{ + if (((NC_req*)a)->offset_start > ((NC_req*)b)->offset_start) return (1); + if (((NC_req*)a)->offset_start < ((NC_req*)b)->offset_start) return (-1); + return (0); +} + +/*----< ncmpio_ina_nreqs() 
>-------------------------------------------------*/ +/* This subroutine handles PnetCDF's requests made from non-blocking APIs, + * which contain multiple requests to one or more variable. The input arguments + * are described below. + * reqMode: NC_REQ_RD for read request and NC_REQ_WR for write. + * num_reqs: number of elements in array req_list. + * req_list[]: stores pending requests from non-blocking API calls, which is + * used to construct file offset-length pairs and user buffer + * datatype. + * newnumrecs: number of new records + */ int -ncmpio_intra_node_aggregation_nreqs(NC *ncp, - int reqMode, - int num_reqs, - NC_req *put_list, - MPI_Offset newnumrecs) +ncmpio_ina_nreqs(NC *ncp, + int reqMode, + int num_reqs, + NC_req *req_list, + MPI_Offset newnumrecs) { - int err, status=NC_NOERR; - MPI_Aint bufLen, num_pairs; + int err, status=NC_NOERR, is_incr=1; + void *buf=NULL; + MPI_Aint num_pairs; #ifdef HAVE_MPI_LARGE_COUNT MPI_Count *offsets=NULL, *lengths=NULL; #else - MPI_Aint *offsets=NULL; + MPI_Offset *offsets=NULL; int *lengths=NULL; #endif - MPI_Datatype bufType=MPI_BYTE; #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) double timing = MPI_Wtime(); #endif - /* currently supports write requests only */ - if (fIsSet(reqMode, NC_REQ_RD)) return NC_NOERR; +// printf("%s at %d: rank=%d num_aggrs_per_nod =%d my_aggr=%d num_nonaggrs=%d\n",__func__,__LINE__, ncp->rank, ncp->num_aggrs_per_node, ncp->my_aggr, ncp->num_nonaggrs); + + /* populate reqs[].offset_start, starting offset of each request */ + NC_req *reqs = req_list; + int i, descreasing=0; + for (i=0; imy_aggr >= 0); + lead = (reqMode == NC_REQ_RD) ? 
ncp->get_lead_list + : ncp->put_lead_list; + lead += reqs[i].lead_off; + varp = lead->varp; + + if (varp->ndims == 0) { /* scalar variable */ + reqs[i].offset_start += varp->begin; + } + else if (reqs[i].npairs == 1) { /* only one offset-length pair */ + MPI_Offset off = varp->begin; + + if (IS_RECVAR(varp)) off += reqs[i].start[0] * ncp->recsize; + +// printf("%s at %d: num_reqs=%d reqs[%d].npairs == 1 offset_start=%lld off=%lld\n", __func__,__LINE__,num_reqs,i,reqs[i].offset_start,off); + reqs[i].offset_start += off; + } + else { + /* start/count/stride have been allocated in a contiguous array */ + MPI_Offset *count, *stride, offset_end; + count = reqs[i].start + varp->ndims; + stride = (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) ? NULL : + count + varp->ndims; + + /* calculate access range of this request */ + ncmpio_calc_start_end(ncp, varp, reqs[i].start, count, stride, + &reqs[i].offset_start, &offset_end); + } + /* check if offset_start are in a monotonic nondecreasing order */ + if (i > 0 && reqs[i].offset_start < reqs[i-1].offset_start) + descreasing = 1; + } + + /* If a decreasing order is found, sort reqs[] based on reqs[].offset_start + * into an increasing order. + */ + if (descreasing) + qsort(reqs, (size_t)num_reqs, sizeof(NC_req), req_compare); + +// printf("%s at %d: descreasing=%d\n",__func__,__LINE__, descreasing); /* construct file offset-length pairs * num_pairs: total number of off-len pairs * offsets: array of flattened offsets * lengths: array of flattened lengths + * is_incr: whether offsets are incremental */ if (num_reqs > 0) - flatten_reqs(ncp, num_reqs, put_list, &num_pairs, &offsets, &lengths); + flatten_reqs(ncp, reqMode, num_reqs, reqs, &is_incr, &num_pairs, + &offsets, &lengths); else num_pairs = 0; - /* construct write buffer datatype, bufType. 
- * bufLen is the buffer size in bytes +#if 0 +if (0 && num_pairs==10) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld len=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",__func__,__LINE__, num_reqs, num_pairs, +offsets[0],offsets[1],offsets[2],offsets[3],offsets[4],offsets[5], +offsets[6],offsets[7],offsets[8],offsets[9], +lengths[0],lengths[1],lengths[2],lengths[3],lengths[4],lengths[5], +lengths[6],lengths[7],lengths[8],lengths[9]); + +else if (num_pairs==12) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld len=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",__func__,__LINE__, num_reqs, num_pairs, +offsets[0],offsets[1],offsets[2],offsets[3],offsets[4], +offsets[5],offsets[6],offsets[7],offsets[8],offsets[9], +offsets[10],offsets[11], +lengths[0],lengths[1],lengths[2],lengths[3],lengths[4], +lengths[5],lengths[6],lengths[7],lengths[8],lengths[9], +lengths[10],lengths[11]); +else if (num_pairs) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld len=%lld\n",__func__,__LINE__, num_reqs, num_pairs,offsets[0],lengths[0]); +#endif + + /* Populate buf_view, which contains metadata of the user buffers in the + * nonblocking requests. If buf is non-contiguous, buf to NULL and + * buf_view.type will be a derived datatype constructed using MPI_BOTTOM. 
*/ - if (num_reqs > 0) { - construct_buf_type(ncp, num_reqs, put_list, &bufLen, &bufType); - bufLen = 1; - } - else - bufLen = 0; + PNCIO_View buf_view; + err = flat_buf_type(ncp, reqMode, num_reqs, reqs, &buf_view, &buf); + if (status == NC_NOERR) status = err; +if (num_reqs > 0) assert(buf != NULL); + +#if 0 +if (buf_view.count > 1) printf("%s at %d: buf_view count=%lld off=%lld %lld len=%lld %lld\n",__func__,__LINE__, buf_view.count, buf_view.off[0], buf_view.off[1], buf_view.len[0],buf_view.len[1]); +else if (buf_view.count) printf("%s at %d: buf_view count=%lld off=%lld len=%lld\n",__func__,__LINE__, buf_view.count, buf_view.off[0], buf_view.len[0]); + +{int *wkl; +int nelems, j,k, xsz=4; +char *xbuf, msg[1024],str[64]; +printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size); + wkl = (int*) malloc(buf_view.size); + nelems=buf_view.size/xsz; + xbuf = buf; + memcpy(wkl, xbuf, buf_view.size); ncmpii_in_swapn(wkl, nelems, xsz); + sprintf(msg,"%s at %d: nelems=%d buf=(%p) ",__func__,__LINE__, nelems, xbuf); + for (k=0; kaggr_time += MPI_Wtime() - timing; + if (ncp->rank == ncp->my_aggr) ncp->ina_time_flatten += MPI_Wtime() - timing; #endif - err = intra_node_aggregation(ncp, num_pairs, offsets, lengths, bufLen, - bufType, NULL); + int saved_my_aggr, saved_num_nonaggrs; + saved_my_aggr = ncp->my_aggr; + saved_num_nonaggrs = ncp->num_nonaggrs; + if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) { + /* Temporarily set ncp->my_aggr and ncp->num_nonaggrs to be as if + * self rank is an INA aggregator and the INA group size is 1. 
+ */ + ncp->my_aggr = ncp->rank; + ncp->num_nonaggrs = 1; + } + +// printf("%s at %d: is_incr=%d buf=%p\n",__func__,__LINE__, is_incr,buf); + /* perform intra-node aggregation */ + if (fIsSet(reqMode, NC_REQ_WR)) + err = ina_put(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf); + else + err = ina_get(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf); if (status == NC_NOERR) status = err; - /* free and reset bufType */ - if (bufType != MPI_BYTE && bufType != MPI_DATATYPE_NULL) - MPI_Type_free(&bufType); + if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) { + /* restore ncp->my_aggr and ncp->num_nonaggrs */ + ncp->my_aggr = saved_my_aggr; + ncp->num_nonaggrs = saved_num_nonaggrs; + } + +#if 0 +if (fIsSet(reqMode, NC_REQ_RD)) +{int *wkl; +int nelems, j,k, xsz=4; +char *xbuf, msg[1024],str[64]; +printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size); + wkl = (int*) malloc(buf_view.size); + nelems=buf_view.size/xsz; + xbuf = buf; + memcpy(wkl, xbuf, buf_view.size); ncmpii_in_swapn(wkl, nelems, xsz); + sprintf(msg,"%s at %d: nelems=%d buf=(%p) ",__func__,__LINE__, nelems, xbuf); + for (k=0; k------------------------------------*/ -/* This is a collective call */ +/*----< ncmpio_ina_req() >---------------------------------------------------*/ +/* This subroutine handles a single request made by blocking APIs, involving + * only one variable. Below describe the subroutine arguments. + * reqMode: NC_REQ_RD for read request and NC_REQ_WR for write. + * varp: pointer to the variable struct. 
+ * start[]: starting offsets + * count[]: counts along each dimension + * stride[]: stride along each dimension + * buf_len: size of I/O buffer in bytes + * buf: pointer to the user buffer + */ int -ncmpio_intra_node_aggregation(NC *ncp, - int reqMode, - NC_var *varp, - const MPI_Offset *start, - const MPI_Offset *count, - const MPI_Offset *stride, - MPI_Offset bufCount, - MPI_Datatype bufType, - void *buf) +ncmpio_ina_req(NC *ncp, + int reqMode, + NC_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + MPI_Offset buf_len, + void *buf) { - int err, status=NC_NOERR; + int err, status=NC_NOERR, is_incr=1; MPI_Aint num_pairs; + PNCIO_View buf_view; #ifdef HAVE_MPI_LARGE_COUNT MPI_Count *offsets=NULL, *lengths=NULL; #else - MPI_Aint *offsets=NULL; + MPI_Offset *offsets=NULL; int *lengths=NULL; #endif #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) double timing = MPI_Wtime(); #endif - /* currently supports write requests only */ - if (fIsSet(reqMode, NC_REQ_RD)) return NC_NOERR; - - if (buf == NULL) /* zero-length request */ - return intra_node_aggregation(ncp, 0, NULL, NULL, 0, MPI_BYTE, NULL); - - /* construct file offset-length pairs - * num_pairs: total number of off-len pairs - * offsets: array of flattened offsets - * lengths: array of flattened lengths - */ - err = flatten_req(ncp, varp, start, count, stride, &num_pairs, &offsets, - &lengths); - if (err != NC_NOERR) { + /* blocking API's buffer passed here is always contiguous */ + buf_view.type = MPI_BYTE; + buf_view.is_contig = 1; + buf_view.size = buf_len; + buf_view.count = 0; + buf_view.off = NULL; + buf_view.len = NULL; + +// printf("%s at %d: buf=%s\n",__func__,__LINE__, (buf==NULL)?"NULL":"NOT NULL"); + if (buf_len == 0 || buf == NULL) { + /* This is a zero-length request. When in collective data mode, this + * rank must still participate collective calls. When INA is enabled, + * this rank tells its aggregator that it has no I/O data. 
When INA is + * disabled, this rank must participate other collective file call. + */ num_pairs = 0; - if (offsets != NULL) - NCI_Free(offsets); - offsets = NULL; + buf_view.size = 0; + buf_view.count = 0; } - status = err; + else { + /* construct file access offset-length pairs + * num_pairs: total number of off-len pairs + * offsets: array of flattened offsets + * lengths: array of flattened lengths + * is_incr: whether offsets are incremental + */ + err = flatten_req(ncp, varp, start, count, stride, &is_incr, + &num_pairs, &offsets, &lengths); + if (err != NC_NOERR) { /* make this rank zero-sized request */ + is_incr = 1; + num_pairs = 0; + buf_len = 0; + buf_view.size = 0; + buf_view.count = 0; + if (offsets != NULL) NCI_Free(offsets); + if (lengths != NULL) NCI_Free(lengths); + offsets = NULL; + lengths = NULL; + } + status = err; + } +// if (num_pairs > 0) printf("%s at %d: num_pairs=%ld off=%lld len=%lld\n",__func__,__LINE__, num_pairs,offsets[0],lengths[0]); #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - ncp->aggr_time += MPI_Wtime() - timing; + if (ncp->rank == ncp->my_aggr) + ncp->ina_time_flatten += MPI_Wtime() - timing; #endif - err = intra_node_aggregation(ncp, num_pairs, offsets, lengths, bufCount, - bufType, buf); - if (status == NC_NOERR) status = err; + int saved_my_aggr, saved_num_nonaggrs; + saved_my_aggr = ncp->my_aggr; + saved_num_nonaggrs = ncp->num_nonaggrs; + if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) { + /* Temporarily set ncp->my_aggr and ncp->num_nonaggrs to be as if + * self rank is an INA aggregator and the INA group size is 1. 
+ */ + ncp->my_aggr = ncp->rank; + ncp->num_nonaggrs = 1; + } +// if (num_pairs) printf("%s at %d: num_pairs=%ld off=%lld len=%lld\n",__func__,__LINE__, num_pairs,offsets[0],lengths[0]); +// if (buf_view.count) printf("%s at %d: buf_view count=%lld off=%lld len=%lld\n",__func__,__LINE__, buf_view.count, buf_view.off[0], buf_view.len[0]); + +// printf("%s at %d: buf_view count=%lld size=%lld is_contig=%d buf=%p\n",__func__,__LINE__, buf_view.count,buf_view.size,buf_view.is_contig,buf); + /* perform intra-node aggregation */ + if (fIsSet(reqMode, NC_REQ_WR)) { + err = ina_put(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf); + if (status == NC_NOERR) status = err; + } + else { + err = ina_get(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf); + if (status == NC_NOERR) status = err; + } + +#if 0 +if (fIsSet(reqMode, NC_REQ_RD)) +{unsigned long long *wkl; int xsz=8; // int *wkl; int xsz=4; +int nelems, j,k; +char *xbuf, msg[1024],str[64]; +printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size); + wkl = (unsigned long long*) malloc(buf_view.size); // wkl = (int*) malloc(buf_view.size); + nelems=buf_view.size/xsz; + xbuf = buf; + memcpy(wkl, xbuf, buf_view.size); ncmpii_in_swapn(wkl, nelems, xsz); + sprintf(msg,"%s at %d: %s nelems=%d buf=(%p) ",__func__,__LINE__, ncp->path,nelems, xbuf); + // for (k=0; knum_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) { + /* restore ncp->my_aggr and ncp->num_nonaggrs */ + ncp->my_aggr = saved_my_aggr; + ncp->num_nonaggrs = saved_num_nonaggrs; + } return status; } diff --git a/src/drivers/ncmpio/ncmpio_open.c b/src/drivers/ncmpio/ncmpio_open.c index a24726ee90..c5540202ad 100644 --- a/src/drivers/ncmpio/ncmpio_open.c +++ b/src/drivers/ncmpio/ncmpio_open.c @@ -32,118 +32,295 @@ /*----< ncmpio_open() >------------------------------------------------------*/ int -ncmpio_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - MPI_Info user_info, /* user's 
and env info combined */ - void **ncpp) +ncmpio_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info user_info, /* user's and env info combined */ + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) { - char *env_str, *mpi_name; - int i, mpiomode, err, status=NC_NOERR, mpireturn; - MPI_File fh; - MPI_Info info_used; + char *filename, value[MPI_MAX_INFO_VAL + 1], *mpi_name; + int i, rank, nprocs, mpiomode, err, status=NC_NOERR, mpireturn, flag; + int striping_unit; + MPI_File fh=MPI_FILE_NULL; NC *ncp=NULL; *ncpp = NULL; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + /* Note path's validity and omode consistency have been checked in - * ncmpi_open() in src/dispatchers/file.c and - * path consistency will be done in MPI_File_open */ + * ncmpi_open() in src/dispatchers/file.c and path consistency will be done + * in MPI_File_open. + */ /* First, check whether omode is valid or supported ---------------------*/ + /* NC_DISKLESS is not supported yet */ if (omode & NC_DISKLESS) DEBUG_RETURN_ERROR(NC_EINVAL_OMODE) /* NC_MMAP is not supported yet */ if (omode & NC_MMAP) DEBUG_RETURN_ERROR(NC_EINVAL_OMODE) -#if 0 && defined(HAVE_ACCESS) - if (mpiomode == MPI_MODE_RDONLY) { /* file should already exit */ - int rank, file_exist; - MPI_Comm_rank(comm, &rank); - if (rank == 0) { - if (access(path, F_OK) == 0) file_exist = 1; - else file_exist = 0; - } - TRACE_COMM(MPI_Bcast)(&file_exist, 1, MPI_INT, 0, comm); - if (!file_exist) DEBUG_RETURN_ERROR(NC_ENOENT) - } -#endif + /* allocate buffer for header object NC and initialize its contents */ + ncp = (NC*) NCI_Calloc(1, sizeof(NC)); + if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) - /* open file collectively ---------------------------------------------- */ - mpiomode = fIsSet(omode, NC_WRITE) ? 
MPI_MODE_RDWR : MPI_MODE_RDONLY; + *ncpp = (void*)ncp; - TRACE_IO(MPI_File_open, (comm, (char *)path, mpiomode, user_info, &fh)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); + ncp->ncid = ncid; + ncp->comm = comm; /* reuse comm duplicated in dispatch layer */ + ncp->rank = rank; + ncp->nprocs = nprocs; + ncp->mpiinfo = MPI_INFO_NULL; + + /* Extract hints from user_info. Two hints must be extracted now in order + * to continue: + * nc_pncio: whether to user MPI-IO or PnetCDF's PNCIO driver. + * nc_num_aggrs_per_node: number of processes per node to be INA + * aggregators. + * + * ncp->fstype will be initialized in ncmpio_hint_extract() and set in + * PNCIO_FileSysType(). + */ + ncmpio_hint_extract(ncp, user_info); - /* get the file info used/modified by MPI-IO */ - TRACE_IO(MPI_File_get_info, (fh, &info_used)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (ncp->fstype == PNCIO_FSTYPE_CHECK) { + if (rank == 0) + /* Check file system type. If the given file does not exist, check + * its parent folder. Currently PnetCDF's PNCIO drivers support + * Lustre (PNCIO_LUSTRE) and Unix File System (PNCIO_UFS). + */ + ncp->fstype = PNCIO_FileSysType(path); - /* Now the file has been successfully opened, allocate/set NC object */ + MPI_Bcast(&ncp->fstype, 1, MPI_INT, 0, ncp->comm); + } - /* path's validity and omode consistency have been checked in ncmpi_open() - * in src/dispatchers/file.c */ +#ifdef WKL_DEBUG +if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == PNCIO_FSTYPE_MPIIO)? "PNCIO_FSTYPE_MPIIO" : (ncp->fstype == PNCIO_LUSTRE) ? "PNCIO_LUSTRE" : "PNCIO_UFS"); +#endif - /* allocate buffer for header object NC */ - ncp = (NC*) NCI_Calloc(1, sizeof(NC)); - if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) + /* Remove the file system type prefix name if there is any. 
For example, + * when path = "lustre:/home/foo/testfile.nc", remove "lustre:" to make + * filename pointing to "/home/foo/testfile.nc", so it can be used in POSIX + * access() below + */ + filename = ncmpii_remove_file_system_type_prefix(path); + + ncp->path = path; /* reuse path duplicated in dispatch layer */ + ncp->pncio_fh = NULL; + ncp->iomode = omode; + + ncp->collective_fh = MPI_FILE_NULL; + ncp->independent_fh = MPI_FILE_NULL; + + /* Setting file open mode in mpiomode which may later be needed in + * ncmpi_begin_indep_data() to open file for independent data mode. + */ + mpiomode = fIsSet(omode, NC_WRITE) ? MPI_MODE_RDWR : MPI_MODE_RDONLY; + ncp->mpiomode = mpiomode; /* PnetCDF default fill mode is no fill */ fClr(ncp->flags, NC_MODE_FILL); - if (!fIsSet(omode, NC_WRITE)) fSet(ncp->flags, NC_MODE_RDONLY); - ncp->ncid = ncid; + /* set read-only mode */ + if (!fIsSet(omode, NC_WRITE)) fSet(ncp->flags, NC_MODE_RDONLY); - /* chunk size for reading header (set default before check hints) */ - ncp->chunk = PNC_DEFAULT_CHUNKSIZE; + fSet(ncp->flags, env_mode); - /* buffer to pack noncontiguous user buffers when calling wait() */ - ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE; + /* node_ids stores a list of unique IDs of compute nodes of all MPI ranks + * in the MPI communicator passed from the user application. It is a keyval + * attribute cached in the communicator. See src/dispatchers/file.c for + * details. The node IDs will be used when the intra-node aggregation (INA) + * is enabled and when PnetCDF's PNCIO driver is used. + * + * When intra-node aggregation (INA) is enabled, node IDs are used to + * create a new MPI communicator consisting of the intra-node aggregators + * only. The communicator will be used to call file open in MPI-IO or + * PnetCDF's PNCIO driver. This means only intra-node aggregators will + * perform file I/O in PnetCDF collective put and get operations. 
+ * + * node_ids will be used to calculate cb_nodes, the number of MPI-IO/PNCIO + * aggregators (not INA aggregators). + */ + ncp->node_ids = node_ids; - /* Extract PnetCDF specific I/O hints from user_info and set default hint - * values into info_used. Note some MPI libraries, such as MPICH 3.3.1 and - * priors fail to preserve user hints that are not recogniozed by the MPI - * libraries. + /* When the total number of aggregators >= number of processes, disable + * intra-node aggregation. */ - ncmpio_set_pnetcdf_hints(ncp, user_info, info_used); - - ncp->iomode = omode; - ncp->comm = comm; /* reuse comm duplicated in dispatch layer */ - MPI_Comm_rank(comm, &ncp->rank); - MPI_Comm_size(comm, &ncp->nprocs); - ncp->mpiinfo = info_used; /* is not MPI_INFO_NULL */ - ncp->mpiomode = mpiomode; - ncp->collective_fh = fh; - ncp->independent_fh = (ncp->nprocs > 1) ? MPI_FILE_NULL : fh; - ncp->path = (char*) NCI_Malloc(strlen(path) + 1); - strcpy(ncp->path, path); - -#ifdef PNETCDF_DEBUG - /* PNETCDF_DEBUG is set at configure time, which will be overwritten by - * the run-time environment variable PNETCDF_SAFE_MODE */ - ncp->safe_mode = 1; -#endif - /* If environment variable PNETCDF_SAFE_MODE is set to 1, then we perform - * a strict consistent test, i.e. arguments used in def_dim/def_var APIs + if (ncp->num_aggrs_per_node * node_ids.num_nodes >= ncp->nprocs) + ncp->num_aggrs_per_node = 0; + + /* ncp->num_aggrs_per_node = 0, or > 0 indicates whether this feature + * is disabled or enabled globally for all processes. */ - if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) { - if (*env_str == '0') ncp->safe_mode = 0; - else ncp->safe_mode = 1; - /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can - * be '\0' (null character). 
In this case, safe_mode is enabled */ + ncp->my_aggr = -1; + ncp->ina_comm = MPI_COMM_NULL; + ncp->ina_nprocs = 0; + ncp->ina_rank = -1; + ncp->ina_node_list = NULL; + if (ncp->num_aggrs_per_node > 0) { + /* Must duplicate node_ids, as node_ids.ids[] will be modified by + * ncmpio_ina_init(). + */ + ncp->node_ids.ids = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + memcpy(ncp->node_ids.ids, node_ids.ids, sizeof(int) * ncp->nprocs); + + /* Divide all ranks into groups. Each group is assigned with one + * intra-node aggregator. The following metadata related to intra-node + * aggregation will be set up. + * ncp->my_aggr is the aggregator's rank ID of this group. When == + * ncp->rank, this rank is an aggregator. + * ncp->num_nonaggrs is the number of non-aggregators assigned to this + * rank (an aggregator) + * ncp->ina_comm will be created consisting of only intra-node + * aggregators, which will be used when calling MPI_File_open(). + * For non-aggregator, ncp->ina_comm == MPI_COMM_NULL. + * ncp->node_ids.ids[] will be modified to contain the nodes IDs of + * intra-node aggregators only, which will be passed to pncio_fh. + */ + err = ncmpio_ina_init(ncp); + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); + + /* As non-aggregators will not perform any file I/O, we now can replace + * comm with ina_comm. Same for nprocs. + */ + comm = ncp->ina_comm; + nprocs = ncp->ina_nprocs; + + /* For non-aggregators, comm is MPI_COMM_NULL. As the remaining task of + * this subroutine is to open the file and obtain the file handler, + * non-aggregators can skip. 
+ */ + if (comm == MPI_COMM_NULL) { + if (user_info != MPI_INFO_NULL) + MPI_Info_dup(user_info, &ncp->mpiinfo); + goto fn_exit; + } } + /* open file collectively ---------------------------------------------- */ + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + TRACE_IO(MPI_File_open, (comm, path, mpiomode, user_info, &fh)); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + DEBUG_FOPEN_ERROR(err); + } + + /* Now the file has been successfully opened */ + ncp->collective_fh = fh; + ncp->independent_fh = (nprocs > 1) ? MPI_FILE_NULL : fh; + + /* get the I/O hints used/modified by MPI-IO */ + TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo)); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + DEBUG_FOPEN_ERROR(err); + } + } + else { + /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */ + ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File)); + ncp->pncio_fh->file_system = ncp->fstype; + ncp->pncio_fh->node_ids = ncp->node_ids; + + err = PNCIO_File_open(comm, filename, mpiomode, user_info, + ncp->pncio_fh); + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); + + /* Now the file has been successfully opened, obtain the I/O hints + * used/modified by PNCIO driver. + */ + err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo); + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); + } + +fn_exit: + striping_unit = -1; + + if (ncp->num_aggrs_per_node > 0) { + /* When intra-node aggregation is enabled, it is necessary to make sure + * non-aggregators obtain consistent values of file striping hints. 
+ * + * non-aggregator do not have hints returned from MPI_File_get_info() + */ + int striping_info[2]; + if (ncp->rank == 0) { + MPI_Info_get(ncp->mpiinfo, "striping_unit", MPI_MAX_INFO_VAL-1, + value, &flag); + striping_info[0] = 0; + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + striping_info[0] = (int)strtol(value,NULL,10); + if (errno != 0) striping_info[0] = 0; + } + + MPI_Info_get(ncp->mpiinfo, "striping_factor", MPI_MAX_INFO_VAL-1, + value, &flag); + striping_info[1] = 0; + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + striping_info[1] = (int)strtol(value,NULL,10); + if (errno != 0) striping_info[1] = 0; + } + } + + MPI_Bcast(striping_info, 2, MPI_INT, 0, ncp->comm); + + if (ncp->my_aggr != ncp->rank) { + sprintf(value, "%d", striping_info[0]); + MPI_Info_set(ncp->mpiinfo, "striping_unit", value); + sprintf(value, "%d", striping_info[1]); + MPI_Info_set(ncp->mpiinfo, "striping_factor", value); + } + + striping_unit = striping_info[0]; + } + else { + MPI_Info_get(ncp->mpiinfo, "striping_unit", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + striping_unit = (int)strtol(value,NULL,10); + if (errno != 0) striping_unit = -1; + } + } + + if (ncp->data_chunk == -1) + /* if not set by user hint, nc_data_move_chunk_size */ + ncp->data_chunk = (striping_unit > 0) ? striping_unit + : PNC_DATA_MOVE_CHUNK_SIZE; + + /* Copy MPI-IO hints into ncp->mpiinfo */ + ncmpio_hint_set(ncp, ncp->mpiinfo); + + /* ina_node_list is no longer needed */ + if (ncp->ina_node_list != NULL) { + NCI_Free(ncp->ina_node_list); + ncp->ina_node_list = NULL; + } + if (ncp->num_aggrs_per_node > 0) { + /* node_ids is no longer needed. Note node_ids is duplicated above from + * the MPI communicator's cached keyval attribute when + * ncp->num_aggrs_per_node > 0. 
+ */ + NCI_Free(ncp->node_ids.ids); + ncp->node_ids.ids = NULL; + } + if (ncp->pncio_fh != NULL) + ncp->pncio_fh->node_ids.ids = NULL; + /* read header from file into NC object pointed by ncp -------------------*/ err = ncmpio_hdr_get_NC(ncp); if (err == NC_ENULLPAD) status = NC_ENULLPAD; /* non-fatal error */ else if (err != NC_NOERR) { /* fatal error */ - ncmpio_close_files(ncp, 0); + ncmpio_file_close(ncp); + if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm); ncmpio_free_NC(ncp); - return err; + DEBUG_RETURN_ERROR(err); } #ifdef ENABLE_SUBFILING @@ -152,29 +329,28 @@ ncmpio_open(MPI_Comm comm, err = ncmpio_get_att(ncp, NC_GLOBAL, "_PnetCDF_SubFiling.num_subfiles", &ncp->num_subfiles, MPI_INT); if (err == NC_NOERR && ncp->num_subfiles > 1) { - int i; /* ignore error NC_ENOTATT if this attribute is not defined */ for (i=0; ivars.ndefined; i++) { /* variables may have different numbers of subfiles */ err = ncmpio_get_att(ncp, i, "_PnetCDF_SubFiling.num_subfiles", &ncp->vars.value[i]->num_subfiles,MPI_INT); if (err == NC_ENOTATT) continue; - if (err != NC_NOERR) return err; + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); if (ncp->vars.value[i]->num_subfiles > 1) { /* find the orginal ndims of variable i */ err = ncmpio_get_att(ncp,i,"_PnetCDF_SubFiling.ndims_org", &ncp->vars.value[i]->ndims_org,MPI_INT); - if (err != NC_NOERR) return err; + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); ncp->vars.value[i]->dimids_org = (int*) NCI_Malloc( ncp->vars.value[i]->ndims_org * SIZEOF_INT); err = ncmpio_get_att(ncp,i,"_PnetCDF_SubFiling.dimids_org", ncp->vars.value[i]->dimids_org, MPI_INT); - if (err != NC_NOERR) return err; + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); } } /* open subfile */ err = ncmpio_subfile_open(ncp); - if (err != NC_NOERR) return err; + if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); } else ncp->num_subfiles = 0; } @@ -191,21 +367,6 @@ ncmpio_open(MPI_Comm comm, ncp->vars.value[i]->attrs.hash_size = ncp->hash_size_attr; #endif - /* 
determine whether to enable intra-node aggregation and set up all - * intra-node aggregation metadata. - * ncp->num_aggrs_per_node = 0, or non-zero indicates whether this feature - * is enabled globally for all processes. - * ncp->my_aggr = -1 or >= 0 indicates whether aggregation is effectively - * enabled for the aggregation group of this process. - */ - ncp->my_aggr = -1; - if (ncp->num_aggrs_per_node != 0) { - err = ncmpio_intra_node_aggr_init(ncp); - if (err != NC_NOERR) return err; - } - - *ncpp = (void*)ncp; - return status; } diff --git a/src/drivers/ncmpio/ncmpio_subfile.c b/src/drivers/ncmpio/ncmpio_subfile.c index e1be70ec7c..c0d0bd8557 100644 --- a/src/drivers/ncmpio/ncmpio_subfile.c +++ b/src/drivers/ncmpio/ncmpio_subfile.c @@ -129,10 +129,12 @@ subfile_create(NC *ncp) MPI_Info_set(info, "romio_lustre_start_iodevice", offset); MPI_Info_set(info, "striping_factor", "1"); */ + ncmpii_construct_node_list(ncp->comm_sf, &ncp->node_ids_sf.num_nodes, + &ncp->node_ids_sf.ids); void *ncp_sf; - status = ncmpio_create(ncp->comm_sf, path_sf, ncp->iomode, ncp->ncid, info, - &ncp_sf); + status = ncmpio_create(ncp->comm_sf, path_sf, ncp->iomode, ncp->ncid, + ncp->flags, info, ncp->node_ids_sf, &ncp_sf); if (status != NC_NOERR && myrank == 0) fprintf(stderr, "%s: error in creating file(%s): %s\n", __func__, path_sf, ncmpi_strerror(status)); @@ -186,9 +188,12 @@ ncmpio_subfile_open(NC *ncp) /* sprintf(path_sf, "%s%d/%s", path, color, file); */ sprintf(path_sf, "%s.subfile_%i.%s", ncp->path, color, "nc"); + ncmpii_construct_node_list(ncp->comm_sf, &ncp->node_ids_sf.num_nodes, + &ncp->node_ids_sf.ids); + void *ncp_sf; status = ncmpio_open(ncp->comm_sf, path_sf, ncp->iomode, ncp->ncid, - MPI_INFO_NULL, &ncp_sf); + ncp->flags, MPI_INFO_NULL, ncp->node_ids_sf, &ncp_sf); ncp->ncp_sf = (NC*) ncp_sf; return status; @@ -200,6 +205,9 @@ int ncmpio_subfile_close(NC *ncp) int status = NC_NOERR; if (ncp->ncp_sf != NULL) { + if (ncp->node_ids_sf.ids != NULL) + 
free(ncp->node_ids_sf.ids); + status = ncmpio_close(ncp->ncp_sf); if (status != NC_NOERR) return status; ncp->ncp_sf = NULL; @@ -315,7 +323,7 @@ int ncmpio_subfile_partition(NC *ncp) if (dpp[vpp[i]->dimids[par_dim_id]]->size/ncp->num_subfiles > 0 && vpp[i]->ndims >= par_dim_id+1 && vpp[i]->ndims >= SUBFILING_MIN_NDIMS) { - int varid, j, jj, k; + int varid, jj, k; int var_ndims = vpp[i]->ndims; /* keep org ndims */ int dimids[var_ndims]; char *key[ncp->num_subfiles][var_ndims]; @@ -1003,7 +1011,6 @@ ncmpio_subfile_getput_vars(NC *ncp, for (i=0; i #include "ncmpio_NC.h" -/*----< ncmpio_file_sync() >-------------------------------------------------*/ -/* This function must be called collectively, no matter if it is in collective - * or independent data mode. - */ -int -ncmpio_file_sync(NC *ncp) { - char *mpi_name; - int mpireturn; - - if (ncp->independent_fh != MPI_FILE_NULL) { - TRACE_IO(MPI_File_sync, (ncp->independent_fh)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - /* when nprocs == 1, ncp->collective_fh == ncp->independent_fh */ - if (ncp->nprocs == 1) return NC_NOERR; - - /* ncp->collective_fh is never MPI_FILE_NULL as collective mode is - * default in PnetCDF */ - TRACE_IO(MPI_File_sync, (ncp->collective_fh)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - - /* Barrier is not necessary ... - TRACE_COMM(MPI_Barrier)(ncp->comm); - */ - - return NC_NOERR; -} - #define NC_NUMRECS_OFFSET 4 /*----< ncmpio_write_numrecs() >---------------------------------------------*/ -/* root process writes the new record number into file. +/* Only root process writes the new record number into file. * This function is called by: * 1. ncmpio_sync_numrecs * 2. 
collective nonblocking wait API, if the new number of records is bigger @@ -69,32 +39,47 @@ int ncmpio_write_numrecs(NC *ncp, MPI_Offset new_numrecs) { - char *mpi_name; - int mpireturn, err; - MPI_File fh; - MPI_Status mpistatus; + int err=NC_NOERR; + PNCIO_View buf_view; - if (!fIsSet(ncp->flags, NC_HCOLL) && ncp->rank > 0) - /* Only root process writes numrecs in file */ - return NC_NOERR; + buf_view.type = MPI_BYTE; + buf_view.size = 0; + buf_view.count = 1; + buf_view.is_contig = 1; - /* return now if there is no record variabled defined */ + /* return now if there is no record variable defined */ if (ncp->vars.num_rec_vars == 0) return NC_NOERR; - fh = ncp->independent_fh; - if (ncp->nprocs > 1 && !NC_indep(ncp)) - fh = ncp->collective_fh; + /* When intra-node aggregation is enabled, non-aggregators do not + * participate any collective calls below. + */ + if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr) + return NC_NOERR; - if (ncp->rank > 0 && fIsSet(ncp->flags, NC_HCOLL)) { - /* other processes participate the collective call */ - TRACE_IO(MPI_File_write_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - return (mpireturn == MPI_SUCCESS) ? NC_NOERR : - ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (ncp->rank > 0) { + /* Currently in independent data mode */ + if (NC_indep(ncp)) + return NC_NOERR; + + /* If collective MPI-IO is required for all MPI-IO calls, then all + * non-root processes participate the collective write call with + * zero-size requests. + */ + if (fIsSet(ncp->flags, NC_HCOLL)) + ncmpio_file_write_at_all(ncp, 0, NULL, buf_view); + + /* If not requiring all MPI-IO calls to be collective, non-root + * processes can return now. This is because only root process writes + * numrecs to the file header. 
+ */ + return NC_NOERR; } + /* codes below run by root only */ if (new_numrecs > ncp->numrecs || NC_ndirty(ncp)) { int len; char pos[8], *buf=pos; + MPI_Offset wlen; /* update ncp->numrecs */ if (new_numrecs > ncp->numrecs) ncp->numrecs = new_numrecs; @@ -113,41 +98,32 @@ ncmpio_write_numrecs(NC *ncp, } /* ncmpix_put_xxx advances the 1st argument with size len */ - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - - /* root's file view always includes the entire file header */ - if (fIsSet(ncp->flags, NC_HCOLL) && ncp->nprocs > 1) { - TRACE_IO(MPI_File_write_at_all, (fh, NC_NUMRECS_OFFSET, (void*)pos, - len, MPI_BYTE, &mpistatus)); - } - else { - TRACE_IO(MPI_File_write_at, (fh, NC_NUMRECS_OFFSET, (void*)pos, - len, MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (err == NC_EFILE) DEBUG_RETURN_ERROR(NC_EWRITE) - } - else { - /* update the number of bytes written since file open. - * Because the above MPI write writes either 4 or 8 bytes, - * calling MPI_Get_count() is sufficient. No need to call - * MPI_Get_count_c() + if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr) + /* When intra-node aggregation is enabled, non-aggregators do not + * participate the collective call. 
*/ - int put_size; - mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size); - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += len; - else - ncp->put_size += put_size; + return NC_NOERR; + + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + /* reset fileview */ + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + if (err != NC_NOERR) DEBUG_RETURN_ERROR(err) } + +// printf("%s at %d: new_numrecs=%lld NC_NUMRECS_OFFSET=%d\n",__func__,__LINE__,new_numrecs,NC_NUMRECS_OFFSET); + buf_view.size = len; + + /* root's file view always includes the entire file header */ + if (!NC_indep(ncp) && fIsSet(ncp->flags, NC_HCOLL) && ncp->nprocs > 1) + wlen = ncmpio_file_write_at_all(ncp, NC_NUMRECS_OFFSET, (void*)pos, + buf_view); + else + wlen = ncmpio_file_write_at(ncp, NC_NUMRECS_OFFSET, (void*)pos, + buf_view); + if (wlen < 0) + DEBUG_RETURN_ERROR((int)wlen) } - return NC_NOERR; + return err; } /*----< ncmpio_sync_numrecs() >-----------------------------------------------*/ @@ -199,10 +175,11 @@ ncmpio_sync_numrecs(void *ncdp) return ncmpii_error_mpi2nc(mpireturn, "MPI_Allreduce"); } +// printf("%s at %d: max_numrecs=%lld\n",__func__,__LINE__,max_numrecs); /* root process writes max_numrecs to file */ status = ncmpio_write_numrecs(ncp, max_numrecs); - if (ncp->nprocs > 1 && ncp->safe_mode == 1) { + if (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_MODE_SAFE)) { /* broadcast root's status, because only root writes to the file */ int root_status = status; TRACE_COMM(MPI_Bcast)(&root_status, 1, MPI_INT, 0, ncp->comm); diff --git a/src/drivers/ncmpio/ncmpio_util.c b/src/drivers/ncmpio/ncmpio_util.c index 8034f9f0be..179bd7a4b6 100644 --- a/src/drivers/ncmpio/ncmpio_util.c +++ b/src/drivers/ncmpio/ncmpio_util.c @@ -18,267 +18,378 @@ #include #include +#include #include "ncmpio_NC.h" -/*----< ncmpio_set_pnetcdf_hints() >-----------------------------------------*/ -/* this is where the I/O hints designated to pnetcdf are extracted and their - * default 
values are set. +#define MAX_INT_LEN 24 + +/*----< ncmpio_hint_extract() >----------------------------------------------*/ +/* Extract hints from info. Argument info is the info object set by application + * user and passed to ncmpi_create() or ncmpi_open(). For those PnetCDF hints + * are not set in info, their default values are used. */ -void ncmpio_set_pnetcdf_hints(NC *ncp, - MPI_Info user_info, - MPI_Info info_used) +void ncmpio_hint_extract(NC *ncp, + MPI_Info info) { char value[MPI_MAX_INFO_VAL]; - int flag; + int flag, ival; + long long llval; + MPI_Offset var_align_val, hdr_align_val; - if (user_info == MPI_INFO_NULL) flag = 0; + assert(ncp != NULL); - /* Note info_used cannot be MPI_INFO_NULL, as it is returned from a call to - * MPI_File_get_info() - */ - assert(info_used != MPI_INFO_NULL); + ncp->info_v_align = -1; /* -1 indicates not set */ + ncp->info_r_align = -1; /* -1 indicates not set */ + + /* chunk size for reading header (set default before check hints) */ + ncp->hdr_chunk = PNC_HDR_READ_CHUNK_SIZE; + + /* chunk size for moving variables to higher offsets */ + ncp->data_chunk = -1; + + /* buffer to pack noncontiguous user buffers when calling wait() */ + ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE; + +#ifdef ENABLE_SUBFILING + ncp->subfile_mode = 0; + ncp->num_subfiles = 0; + ncp->node_ids_sf.num_nodes = 0; + ncp->node_ids_sf.ids = NULL; +#endif + + ncp->dims.hash_size = PNC_HSIZE_DIM; + ncp->vars.hash_size = PNC_HSIZE_VAR; + ncp->attrs.hash_size = PNC_HSIZE_GATTR; + ncp->hash_size_attr = PNC_HSIZE_VATTR; + + /* number of INA aggregators per compute node */ + ncp->num_aggrs_per_node = 0; + + /* file system type */ + ncp->fstype = PNCIO_FSTYPE_CHECK; + + if (info == MPI_INFO_NULL) return; /* nc_var_align_size, and r_align take effect when a file is created, or * opened and later adding more metadata or variable data */ - ncp->info_v_align = -1; /* -1 indicates not set */ - if (user_info != MPI_INFO_NULL) { - /* aligns starting file offsets of entire 
data section */ - MPI_Info_get(user_info, "nc_var_align_size", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - errno = 0; /* errno must set to zero before calling strtoll */ - ncp->info_v_align = strtoll(value, NULL, 10); - if (errno != 0) ncp->info_v_align = -1; - else if (ncp->info_v_align < 0) ncp->info_v_align = -1; - } + /* aligns starting file offsets of entire data section */ + var_align_val = -1; + MPI_Info_get(info, "nc_var_align_size", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + llval = strtoll(value, NULL, 10); + if (errno == 0 && llval >= 0) + var_align_val = llval; } - if (ncp->info_v_align == -1) - sprintf(value, "%d", FILE_ALIGNMENT_DEFAULT); - else - sprintf(value, OFFFMT, ncp->info_v_align); - MPI_Info_set(info_used, "nc_var_align_size", value); - if (user_info != MPI_INFO_NULL) { - /* Hint nc_header_align_size is now deprecated. But for backward - * compatibility, let's still check. - */ - MPI_Offset info_h_align = -1; - MPI_Info_get(user_info, "nc_header_align_size", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - errno = 0; /* errno must set to zero before calling strtoll */ - info_h_align = strtoll(value, NULL, 10); - if (errno != 0) info_h_align = -1; - else if (info_h_align < 0) info_h_align = -1; - } - /* if nc_header_align_size is set and nc_var_align_size is not set, - * replace hint nc_var_align_size with the value of info_h_align. - */ - if (info_h_align >= 0 && ncp->info_v_align == -1) { - ncp->info_v_align = info_h_align; - sprintf(value, OFFFMT, ncp->info_v_align); - MPI_Info_set(info_used, "nc_var_align_size", value); + /* Hint nc_header_align_size is now deprecated. But for backward + * compatibility, let's still check. 
+ */ + hdr_align_val = -1; + MPI_Info_get(info, "nc_header_align_size", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + llval = strtoll(value, NULL, 10); + if (errno == 0 && llval >= 0) { + /* if nc_header_align_size is set and nc_var_align_size is not set, + * replace hint nc_var_align_size with the value of info_h_align. + */ + if (llval >= 0 && ncp->info_v_align == -1) + hdr_align_val = llval; } } - ncp->info_r_align = -1; - if (user_info != MPI_INFO_NULL) { - /* aligns starting file offset of the record variable section */ - MPI_Info_get(user_info, "nc_record_align_size", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - errno = 0; /* errno must set to zero before calling strtoll */ - ncp->info_r_align = strtoll(value, NULL, 10); - if (errno != 0) ncp->info_r_align = -1; - else if (ncp->info_r_align < 0) ncp->info_r_align = -1; - } + /* hint nc_var_align_size supersedes nc_header_align_size */ + if (var_align_val > 0) + ncp->info_v_align = var_align_val; + else if (hdr_align_val > 0) + ncp->info_v_align = hdr_align_val; + + /* aligns starting file offset of the record variable section */ + MPI_Info_get(info, "nc_record_align_size", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + llval = strtoll(value, NULL, 10); + if (errno == 0 && llval >= 0) + ncp->info_r_align = llval; } - if (ncp->info_r_align == -1) - sprintf(value, "%d", FILE_ALIGNMENT_DEFAULT); - else - sprintf(value, OFFFMT, ncp->info_r_align); - MPI_Info_set(info_used, "nc_record_align_size", value); - - ncp->chunk = PNC_DEFAULT_CHUNKSIZE; - if (user_info != MPI_INFO_NULL) { - /* header reading chunk size */ - MPI_Info_get(user_info, "nc_header_read_chunk_size", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - int chunk; - errno = 0; /* errno must set to zero before calling strtoll */ - chunk = atoi(value); - if (errno != 0) ncp->chunk = 0; - else if 
(ncp->chunk < 0) - ncp->chunk = 0; - else if (chunk > NC_MAX_INT) /* limit to NC_MAX_INT */ - ncp->chunk = NC_MAX_INT; + + /* header reading chunk size */ + MPI_Info_get(info, "nc_header_read_chunk_size", MPI_MAX_INFO_VAL-1, + value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + llval = strtoll(value, NULL, 10); + if (errno == 0) { + if (llval < 0) + ncp->hdr_chunk = PNC_HDR_READ_CHUNK_SIZE; + else if (llval > NC_MAX_INT) /* limit to NC_MAX_INT */ + ncp->hdr_chunk = NC_MAX_INT; + else + ncp->hdr_chunk = (int)llval; + + /* CDF-5's minimum header size is 4 bytes more than CDF-1 2's */ + ncp->hdr_chunk = PNETCDF_RNDUP(MAX(MIN_NC_XSZ+4, ncp->hdr_chunk), + X_ALIGN); } } - sprintf(value, "%d", ncp->chunk); - MPI_Info_set(info_used, "nc_header_read_chunk_size", value); - strcpy(value, "auto"); - if (user_info != MPI_INFO_NULL) { - /* setting in-place byte swap (matters only for Little Endian) */ - MPI_Info_get(user_info, "nc_in_place_swap", MPI_MAX_INFO_VAL-1, value, &flag); - if (flag) { - if (strcasecmp(value, "enable") == 0) { - fClr(ncp->flags, NC_MODE_SWAP_OFF); - fSet(ncp->flags, NC_MODE_SWAP_ON); - } - else if (strcasecmp(value, "disable") == 0) { - fClr(ncp->flags, NC_MODE_SWAP_ON); - fSet(ncp->flags, NC_MODE_SWAP_OFF); - } - else if (strcasecmp(value, "auto") == 0) { - fClr(ncp->flags, NC_MODE_SWAP_ON); - fClr(ncp->flags, NC_MODE_SWAP_OFF); - } + /* setting in-place byte swap (matters only for Little Endian) */ + MPI_Info_get(info, "nc_in_place_swap", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag) { + if (strcasecmp(value, "enable") == 0) { + fClr(ncp->flags, NC_MODE_SWAP_OFF); + fSet(ncp->flags, NC_MODE_SWAP_ON); + } + else if (strcasecmp(value, "disable") == 0) { + fClr(ncp->flags, NC_MODE_SWAP_ON); + fSet(ncp->flags, NC_MODE_SWAP_OFF); + } + else if (strcasecmp(value, "auto") == 0) { + fClr(ncp->flags, NC_MODE_SWAP_ON); + fClr(ncp->flags, NC_MODE_SWAP_OFF); } } - MPI_Info_set(info_used, "nc_in_place_swap", value); - 
if (user_info != MPI_INFO_NULL) { - /* temporal buffer size used to pack noncontiguous aggregated user - * buffers when calling ncmpi_wait/wait_all, Default 16 MiB - */ - MPI_Info_get(user_info, "nc_ibuf_size", MPI_MAX_INFO_VAL-1, value, - &flag); - if (flag) { - MPI_Offset ibuf_size; - errno = 0; /* errno must set to zero before calling strtoll */ - ibuf_size = strtoll(value, NULL, 10); - if (errno == 0 && ibuf_size >= 0) ncp->ibuf_size = ibuf_size; - } + /* Temporal buffer size used to pack non-contiguous aggregated user buffers + * when calling ncmpi_wait/wait_all. Default PNC_DEFAULT_IBUF_SIZE. + */ + MPI_Info_get(info, "nc_ibuf_size", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + llval = strtoll(value, NULL, 10); + if (errno == 0 && llval >= 0) + ncp->ibuf_size = llval; } - sprintf(value, OFFFMT, ncp->ibuf_size); - MPI_Info_set(info_used, "nc_ibuf_size", value); #ifdef ENABLE_SUBFILING - ncp->subfile_mode = 0; - if (user_info != MPI_INFO_NULL) { - MPI_Info_get(user_info, "pnetcdf_subfiling", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - if (strcasecmp(value, "enable") == 0) - ncp->subfile_mode = 1; + MPI_Info_get(info, "pnetcdf_subfiling", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag) { + if (strcasecmp(value, "enable") == 0) + ncp->subfile_mode = 1; + else { + ncp->subfile_mode = 0; + ncp->num_subfiles = 0; } } - if (ncp->subfile_mode) - MPI_Info_set(info_used, "pnetcdf_subfiling", "enable"); - else - MPI_Info_set(info_used, "pnetcdf_subfiling", "disable"); - ncp->num_subfiles = 0; - if (user_info != MPI_INFO_NULL) { - MPI_Info_get(user_info, "nc_num_subfiles", MPI_MAX_INFO_VAL-1, - value, &flag); + if (ncp->subfile_mode == 1) { + MPI_Info_get(info, "nc_num_subfiles", MPI_MAX_INFO_VAL-1, value, &flag); if (flag) { - errno = 0; - ncp->num_subfiles = atoi(value); - if (errno != 0) ncp->num_subfiles = 0; - else if (ncp->num_subfiles < 0) ncp->num_subfiles = 0; + errno = 0; /* errno 
must set to zero before calling atoi */ + ival = atoi(value); + if (errno == 0 && ival >= 0) + ncp->num_subfiles = ival; } } - sprintf(value, "%d", ncp->num_subfiles); - MPI_Info_set(info_used, "nc_num_subfiles", value); - - if (ncp->subfile_mode == 0) ncp->num_subfiles = 0; -#else - MPI_Info_set(info_used, "pnetcdf_subfiling", "disable"); - MPI_Info_set(info_used, "nc_num_subfiles", "0"); #endif - if (user_info != MPI_INFO_NULL) { - /* If romio_no_indep_rw is set to true, let all processes participate - * the read/write file header using MPI collective APIs, where only - * rank 0 has non-zero request count. - */ - MPI_Info_get(user_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - if (strcasecmp(value, "true") == 0) - fSet((ncp)->flags, NC_HCOLL); - } + /* Hash table size for dimensions */ + MPI_Info_get(info, "nc_hash_size_dim", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling atoi */ + ival = atoi(value); + if (errno == 0 && ival >= 0) + ncp->dims.hash_size = ival; } - ncp->dims.hash_size = PNC_HSIZE_DIM; - if (user_info != MPI_INFO_NULL) { - /* Hash table size for dimensions */ - MPI_Info_get(user_info, "nc_hash_size_dim", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - errno = 0; /* errno must set to zero before calling atoi */ - ncp->dims.hash_size = atoi(value); - if (errno != 0 || ncp->dims.hash_size < 0) - ncp->dims.hash_size = PNC_HSIZE_DIM; - } + /* Hash table size for variables */ + MPI_Info_get(info, "nc_hash_size_var", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling atoi */ + ival = atoi(value); + if (errno == 0 && ival >= 0) + ncp->vars.hash_size = ival; } - sprintf(value, "%d", ncp->dims.hash_size); - MPI_Info_set(info_used, "nc_hash_size_dim", value); - ncp->vars.hash_size = PNC_HSIZE_VAR; - if (user_info != MPI_INFO_NULL) { - /* Hash table size for variables */ - MPI_Info_get(user_info, "nc_hash_size_var", 
MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - errno = 0; /* errno must set to zero before calling atoi */ - ncp->vars.hash_size = atoi(value); - if (errno != 0 || ncp->vars.hash_size < 0) - ncp->vars.hash_size = PNC_HSIZE_VAR; - } + /* Hash table size for global attributes */ + MPI_Info_get(info, "nc_hash_size_gattr", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling atoi */ + ival = atoi(value); + if (errno == 0 && ival >= 0) + ncp->attrs.hash_size = ival; } - sprintf(value, "%d", ncp->vars.hash_size); - MPI_Info_set(info_used, "nc_hash_size_var", value); - ncp->attrs.hash_size = PNC_HSIZE_GATTR; - if (user_info != MPI_INFO_NULL) { - /* Hash table size for global attributes */ - MPI_Info_get(user_info, "nc_hash_size_gattr", MPI_MAX_INFO_VAL-1, - value, &flag); + /* Hash table size for non-global attributes */ + MPI_Info_get(info, "nc_hash_size_vattr", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling atoi */ + ival = atoi(value); + if (errno == 0 && ival >= 0) + ncp->hash_size_attr = ival; + } + + /* Number of intra-node aggregators per compute node. 
*/ + if (ncp->nprocs > 1) { + MPI_Info_get(info, "nc_num_aggrs_per_node", MPI_MAX_INFO_VAL-1, value, + &flag); if (flag) { errno = 0; /* errno must set to zero before calling atoi */ - ncp->attrs.hash_size = atoi(value); - if (errno != 0 || ncp->attrs.hash_size < 0) - ncp->attrs.hash_size = PNC_HSIZE_GATTR; + ival = atoi(value); + if (errno == 0 && ival >= 0) + ncp->num_aggrs_per_node = ival; } } - sprintf(value, "%d", ncp->attrs.hash_size); - MPI_Info_set(info_used, "nc_hash_size_gattr", value); - ncp->hash_size_attr = PNC_HSIZE_VATTR; - if (user_info != MPI_INFO_NULL) { - /* Hash table size for non-global attributes */ - MPI_Info_get(user_info, "nc_hash_size_vattr", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - errno = 0; /* errno must set to zero before calling atoi */ - ncp->hash_size_attr = atoi(value); - if (errno != 0 || ncp->hash_size_attr < 0) - ncp->hash_size_attr = PNC_HSIZE_VATTR; + /* If user explicitly want to use MPI-IO instead of PnetCDF's internal PNCIO + * driver, then set PnetCDF I/O hint "nc_pncio" to "disable". + */ + MPI_Info_get(info, "nc_pncio", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag && strcasecmp(value, "disable") == 0) + ncp->fstype = PNCIO_FSTYPE_MPIIO; + + /* Check if user explicitly want all MPI-IO to be collective. */ + MPI_Info_get(info, "romio_no_indep_rw", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag && strcasecmp(value, "true") == 0) + fSet(ncp->flags, NC_HCOLL); + + /* Data movement chunk size when variables need to be moved to higher file + * offsets. 
+ */ + MPI_Info_get(info, "nc_data_move_chunk_size", MPI_MAX_INFO_VAL-1, value, + &flag); + if (flag) { + errno = 0; /* errno must set to zero before calling strtoll */ + llval = strtoll(value, NULL, 10); + if (errno == 0) { + if (llval < 0) + ncp->data_chunk = -1; + else if (llval > NC_MAX_INT) /* limit to NC_MAX_INT */ + ncp->data_chunk = NC_MAX_INT; + else + ncp->data_chunk = (int)llval; } } - sprintf(value, "%d", ncp->hash_size_attr); - MPI_Info_set(info_used, "nc_hash_size_vattr", value); - ncp->num_aggrs_per_node = 0; - if (user_info != MPI_INFO_NULL) { - /* Hash table size for non-global attributes */ - MPI_Info_get(user_info, "nc_num_aggrs_per_node", MPI_MAX_INFO_VAL-1, - value, &flag); - if (flag) { - errno = 0; /* errno must set to zero before calling atoi */ - ncp->num_aggrs_per_node = atoi(value); - if (errno != 0 || ncp->num_aggrs_per_node < 0) - ncp->num_aggrs_per_node = 0; + /* When creating a file, inherit file striping from the parent folder or + * let PnetCDF to decide. + */ + ncp->nc_striping = PNCIO_STRIPING_AUTO; + MPI_Info_get(info, "nc_striping", MPI_MAX_INFO_VAL-1, value, &flag); + if (flag && strcasecmp(value, "inherit") == 0) + ncp->nc_striping = PNCIO_STRIPING_INHERIT; +} + +/*----< ncmpio_hint_set() >--------------------------------------------------*/ +/* Insert PnetCDF hints into info. Argument info is the info object returned + * from an earlier call to MPI_File_get_info(). 
+ */ +void ncmpio_hint_set(NC *ncp, + MPI_Info info) +{ + char int_str[MAX_INT_LEN]; + + assert(ncp != NULL); + assert(info != MPI_INFO_NULL); + + /* nc_var_align_size, and r_align take effect when a file is created, or + * opened and later adding more metadata or variable data + */ + + /* aligns starting file offsets of entire data section */ + if (ncp->info_v_align != -1) { + snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->info_v_align); + MPI_Info_set(info, "nc_var_align_size", int_str); + } + + /* aligns starting file offset of the record variable section */ + if (ncp->info_r_align != -1) { + snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->info_r_align); + MPI_Info_set(info, "nc_record_align_size", int_str); + } + + /* header reading chunk size */ + snprintf(int_str, MAX_INT_LEN, "%d", ncp->hdr_chunk); + MPI_Info_set(info, "nc_header_read_chunk_size", int_str); + + /* variable movement chunk size */ + snprintf(int_str, MAX_INT_LEN, "%d", ncp->data_chunk); + MPI_Info_set(info, "nc_data_move_chunk_size", int_str); + + /* setting in-place byte swap (matters only for Little Endian) */ + int swap_on = fIsSet(ncp->flags, NC_MODE_SWAP_ON); + int swap_off = fIsSet(ncp->flags, NC_MODE_SWAP_OFF); + if (!swap_on && !swap_off) + MPI_Info_set(info, "nc_in_place_swap", "auto"); + else if (swap_on) + MPI_Info_set(info, "nc_in_place_swap", "enable"); + else + MPI_Info_set(info, "nc_in_place_swap", "disable"); + + /* Temporal buffer size used to pack non-contiguous aggregated user buffers + * when calling ncmpi_wait/wait_all. Default PNC_DEFAULT_IBUF_SIZE. 
+ */ + snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->ibuf_size); + MPI_Info_set(info, "nc_ibuf_size", int_str); + +#ifdef ENABLE_SUBFILING + if (ncp->subfile_mode) + MPI_Info_set(info, "pnetcdf_subfiling", "enable"); + else + MPI_Info_set(info, "pnetcdf_subfiling", "disable"); + + snprintf(int_str, MAX_INT_LEN, "%d", ncp->num_subfiles); + MPI_Info_set(info, "nc_num_subfiles", int_str); +#endif + + /* Hash table size for dimensions */ + snprintf(int_str, MAX_INT_LEN, "%d", ncp->dims.hash_size); + MPI_Info_set(info, "nc_hash_size_dim", int_str); + + /* Hash table size for variables */ + snprintf(int_str, MAX_INT_LEN, "%d", ncp->vars.hash_size); + MPI_Info_set(info, "nc_hash_size_var", int_str); + + /* Hash table size for global attributes */ + snprintf(int_str, MAX_INT_LEN, "%d", ncp->attrs.hash_size); + MPI_Info_set(info, "nc_hash_size_gattr", int_str); + + /* Hash table size for non-global attributes */ + snprintf(int_str, MAX_INT_LEN, "%d", ncp->hash_size_attr); + MPI_Info_set(info, "nc_hash_size_vattr", int_str); + + /* Whether using MPI-IO instead of PnetCDF's internal PNCIO driver. */ + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) + MPI_Info_set(info, "nc_pncio", "disable"); + else + MPI_Info_set(info, "nc_pncio", "enable"); + + if (ncp->num_aggrs_per_node > 0) { + /* Number of intra-node aggregators per compute node. 
*/ + snprintf(int_str, MAX_INT_LEN, "%d", ncp->num_aggrs_per_node); + MPI_Info_set(info, "nc_num_aggrs_per_node", int_str); + + /* Add hint "ina_node_list", list of INA aggregators' rank IDs */ + if (ncp->ina_node_list != NULL) { + char value[MPI_MAX_INFO_VAL]; + int i; + snprintf(value, MAX_INT_LEN, "%d", ncp->ina_node_list[0]); + for (i=1; iina_nprocs; i++) { + snprintf(int_str, sizeof(int_str), " %d", ncp->ina_node_list[i]); + if (strlen(value) + strlen(int_str) >= MPI_MAX_INFO_VAL-5) { + strcat(value, " ..."); + break; + } + strcat(value, int_str); + } + MPI_Info_set(info, "nc_ina_node_list", value); } } - sprintf(value, "%d", ncp->num_aggrs_per_node); - MPI_Info_set(info_used, "nc_num_aggrs_per_node", value); + else /* Update hint "num_aggrs_per_node" to indicate disabled. */ + MPI_Info_set(info, "nc_num_aggrs_per_node", "0"); + + /* When creating a file, inherit file striping from the parent folder or + * let PnetCDF to decide. + */ + if (ncp->nc_striping == PNCIO_STRIPING_AUTO) + MPI_Info_set(info, "nc_striping", "auto"); + else + MPI_Info_set(info, "nc_striping", "inherit"); } /*----< ncmpio_first_offset() >-----------------------------------------------*/ @@ -730,12 +841,12 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */ break; } /* The only error codes returned from the above switch block are - * NC_EBADTYPE or NC_ERANGE. Bad varp->xtype and itype have been sanity - * checked at the dispatchers, so NC_EBADTYPE is not possible. Thus, - * the only possible error is NC_ERANGE. NC_ERANGE can be caused by - * one or more elements of buf that is out of range representable by - * the external data type, it is not considered a fatal error. This - * request must continue to finish. + * NC_EBADTYPE or NC_ERANGE. Bad varp->xtype and itype have been sanity + * checked at the dispatchers, so NC_EBADTYPE is not possible. Thus, + * the only possible error is NC_ERANGE. 
NC_ERANGE can be caused by + * one or more elements of buf that is out of range representable by + * the external data type, it is not considered a fatal error. This + * request must continue to finish. */ } else { @@ -785,30 +896,36 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */ MPI_Type_free(&imaptype); } - /* unpacked lbuf into buf based on buftype -----------------------------*/ - if (!buftype_is_contig && lbuf != buf) { - /* no need unpack when buftype is used in MPI_File_read (lbuf == buf) */ + /* Unpacked lbuf into buf based on buftype. Note no need to unpack when + * buftype is used in MPI_File_read, i.e. lbuf == buf. + */ + if (lbuf != buf) { + if (buftype_is_contig) + memcpy(buf, lbuf, ibuf_size); + else { /* buftye is not contiguous */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count position = 0; - mpireturn = MPI_Unpack_c(lbuf, (MPI_Count)ibuf_size, &position, buf, - (MPI_Count)bufcount, buftype, MPI_COMM_SELF); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack_c"); -#else - if (bufcount > NC_MAX_INT) { - if (err == NC_NOERR) - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - } - else { - int position = 0; - if (ibuf_size > NC_MAX_INT) - DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) - mpireturn = MPI_Unpack(lbuf, (int)ibuf_size, &position, buf, - (int)bufcount, buftype, MPI_COMM_SELF); + MPI_Count position = 0; + mpireturn = MPI_Unpack_c(lbuf, (MPI_Count)ibuf_size, &position, + buf, (MPI_Count)bufcount, buftype, + MPI_COMM_SELF); if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack"); - } + return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack_c"); +#else + if (bufcount > NC_MAX_INT) { + if (err == NC_NOERR) + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + } + else { + int position = 0; + if (ibuf_size > NC_MAX_INT) + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + mpireturn = MPI_Unpack(lbuf, (int)ibuf_size, &position, buf, + (int)bufcount, buftype, MPI_COMM_SELF); + if (mpireturn != MPI_SUCCESS) + return 
ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack"); + } #endif + } } if (free_cbuf) NCI_Free(cbuf); if (free_lbuf) NCI_Free(lbuf); @@ -816,3 +933,136 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */ return err; } +/*----< ncmpio_calc_off() >--------------------------------------------------*/ +/* Returns the starting file offset of a subarray request. + * Note zero-length request should never call this subroutine. + * Only a single offset-length pair will call this subroutine. + */ +int +ncmpio_calc_off(const NC *ncp, + const NC_var *varp, + const MPI_Offset *start, /* [varp->ndims] */ + MPI_Offset *offset) /* OUT: start offset */ +{ + int i, ndims = varp->ndims; /* number of dimensions of this variable */ + + /* + * varp->dsizes[] is computed from right to left product of shape + * For example, a 3D array of size 5x4x3 in C order, + * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3 + * For record variable: dsizes[0]=12 dsizes[1]=12 dsizes[2]=3 + */ + if (IS_RECVAR(varp)) { + *offset = 0; + if (ndims > 1) { + /* start from the least significant dimension */ + *offset = start[ndims-1]; + /* the remaining dimensions */ + for (i=ndims-2; i>0; i--) + *offset += start[i]*varp->dsizes[i+1]; + } + *offset *= varp->xsz; /* offset in bytes */ + } + else { + /* first handle the least significant dimension */ + *offset = start[ndims-1]; + /* remaining dimensions till the most significant dimension */ + for (i=ndims-2; i>=0; i--) + *offset += start[i] * varp->dsizes[i+1]; + *offset *= varp->xsz; /* offset in bytes */ + } + + return NC_NOERR; +} + +/*----< ncmpio_calc_start_end() >--------------------------------------------*/ +/* Returns the file offsets of access range of this request: starting file + * offset and end offset (exclusive). + * Note zero-length request should never call this subroutine. 
+ */ +int +ncmpio_calc_start_end(const NC *ncp, + const NC_var *varp, + const MPI_Offset *start, /* [varp->ndims] */ + const MPI_Offset *count, /* [varp->ndims] */ + const MPI_Offset *stride, /* [varp->ndims] */ + MPI_Offset *start_off, /* OUT: start offset */ + MPI_Offset *end_off) /* OUT: end offset */ +{ + int i, ndims = varp->ndims; /* number of dimensions of this variable */ + + /* + * varp->dsizes[] is computed from right to left product of shape + * For example, a 3D array of size 5x4x3 in C order, + * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3 + * For record variable: dsizes[0]=12 dsizes[1]=12 dsizes[2]=3 + */ + if (IS_RECVAR(varp)) { + *start_off = 0; + *end_off = 0; + if (stride == NULL) { + if (ndims > 1) { + /* least significant dimension */ + *start_off = start[ndims-1]; + *end_off = start[ndims-1]+(count[ndims-1]-1); + /* the remaining dimensions */ + for (i=ndims-2; i>0; i--) { + *start_off += start[i]*varp->dsizes[i+1]; + *end_off += (start[i]+(count[i]-1))*varp->dsizes[i+1]; + } + } + *start_off *= varp->xsz; /* offset in bytes */ + *end_off *= varp->xsz; + /* handle the unlimited, most significant dimension */ + *start_off += start[0] * ncp->recsize; + *end_off += (start[0]+(count[0]-1)) * ncp->recsize; + } + else { + if (ndims > 1) { + /* least significant dimension */ + *start_off = start[ndims-1]; + *end_off = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1]; + /* the remaining dimensions */ + for (i=ndims-2; i>0; i--) { + *start_off += start[i]*varp->dsizes[i+1]; + *end_off += (start[i]+(count[i]-1)*stride[i]) * + varp->dsizes[i+1]; + } + } + *start_off *= varp->xsz; /* offset in bytes */ + *end_off *= varp->xsz; + /* handle the unlimited, most significant dimension */ + *start_off += start[0] * ncp->recsize; + *end_off += (start[0]+(count[0]-1)*stride[0]) * ncp->recsize; + } + } + else { + if (stride == NULL) { + /* first handle the least significant dimension */ + *start_off = start[ndims-1]; + *end_off = start[ndims-1] 
+ (count[ndims-1]-1); + /* remaining dimensions till the most significant dimension */ + for (i=ndims-2; i>=0; i--) { + *start_off += start[i] * varp->dsizes[i+1]; + *end_off += (start[i]+(count[i]-1)) * varp->dsizes[i+1]; + } + } + else { + /* first handle the least significant dimension */ + *start_off = start[ndims-1]; + *end_off = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1]; + /* remaining dimensions till the most significant dimension */ + for (i=ndims-2; i>=0; i--) { + *start_off += start[i] * varp->dsizes[i+1]; + *end_off += (start[i]+(count[i]-1)*stride[i])*varp->dsizes[i+1]; + } + } + *start_off *= varp->xsz; /* offset in bytes */ + *end_off *= varp->xsz; + } + *start_off += varp->begin; /* beginning file offset of this variable */ + *end_off += varp->begin + varp->xsz; + + return NC_NOERR; +} + diff --git a/src/drivers/ncmpio/ncmpio_var.c b/src/drivers/ncmpio/ncmpio_var.c index df66a52c0c..09cb3a0b58 100644 --- a/src/drivers/ncmpio/ncmpio_var.c +++ b/src/drivers/ncmpio/ncmpio_var.c @@ -415,7 +415,7 @@ ncmpio_def_var(void *ncdp, ncp->vars.ndefined++; err_check: - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { int minE, mpireturn; /* First check the error code across processes */ @@ -600,7 +600,7 @@ ncmpio_rename_var(void *ncdp, #endif err_check: - if (ncp->safe_mode && ncp->nprocs > 1) { + if (fIsSet(ncp->flags, NC_MODE_SAFE) && ncp->nprocs > 1) { int minE, mpireturn; /* First check error code so far across processes */ diff --git a/src/drivers/ncmpio/ncmpio_vard.c b/src/drivers/ncmpio/ncmpio_vard.c index 7f3fe12248..ac032d1ace 100644 --- a/src/drivers/ncmpio/ncmpio_vard.c +++ b/src/drivers/ncmpio/ncmpio_vard.c @@ -55,9 +55,8 @@ getput_vard(NC *ncp, void *xbuf=NULL; int mpireturn, status=NC_NOERR, err=NC_NOERR, xtype_is_contig=1; int el_size, buftype_is_contig=0, need_swap_back_buf=0; - int need_convert=0, need_swap=0, coll_indep, rw_flag; - MPI_File fh; - MPI_Offset nelems=0, fnelems=0, 
bnelems=0, offset=0; + int need_convert=0, need_swap=0; + MPI_Offset fnelems=0, bnelems=0, offset=0; MPI_Datatype etype=MPI_DATATYPE_NULL, xtype=MPI_BYTE; MPI_Offset filetype_size=0; #ifdef HAVE_MPI_TYPE_SIZE_C @@ -71,6 +70,17 @@ getput_vard(NC *ncp, int type_size; #endif + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + fprintf(stderr, "PnetCDF vard APIs are only supported when using MPI-IO.\n"); + fprintf(stderr, "Please set environment variable PNETCDF_HINTS to \"nc_pncio=disable\"\n"); + return NC_ENOTSUPPORT; + } + + if (ncp->num_aggrs_per_node > 0) { + fprintf(stderr, "PnetCDF vard APIs are not supported when intra-node agggregation is enabled\n"); + return NC_ENOTSUPPORT; + } + #ifdef ENABLE_SUBFILING /* call a separate routine if variable is stored in subfiles */ if (varp->num_subfiles > 1) { @@ -170,7 +180,7 @@ getput_vard(NC *ncp, bnelems = bufcount; } else { - /* find the element type of filetype. ncmpii_dtype_decode() checks + /* find the element type of buftype. ncmpii_dtype_decode() checks * NC_EMULTITYPES */ err = ncmpii_dtype_decode(buftype, &etype, &el_size, &bnelems, NULL, &buftype_is_contig); @@ -214,8 +224,8 @@ getput_vard(NC *ncp, } } - if (!need_convert && - (!need_swap || (can_swap_in_place && buftype_is_contig))) { + if (!need_convert && buftype_is_contig && + (!need_swap || can_swap_in_place)) { /* reuse buftype, bufcount, buf in later MPI file write */ xbuf = buf; if (need_swap) { @@ -246,7 +256,7 @@ getput_vard(NC *ncp, } } else { /* read request */ - if (!need_convert && (!need_swap || buftype_is_contig)) { + if (!need_convert && !need_swap && buftype_is_contig) { /* reuse buftype, bufcount, buf in later MPI file read */ xbuf = buf; } @@ -259,18 +269,7 @@ getput_vard(NC *ncp, xtype_is_contig = 1; } } - - /* Set nelems and xtype which will be used in MPI read/write */ - if (buf != xbuf) { - /* xbuf is a malloc-ed contiguous buffer */ - nelems = bnelems; - } - else { - /* we can safely use bufcount and buftype in MPI File read/write. 
- * Note buftype may be noncontiguous. */ - nelems = bufcount; - xtype = buftype; - } +assert(xtype_is_contig == 1); /* set fileview's displacement to the variable's starting file offset */ offset = varp->begin; @@ -296,7 +295,6 @@ getput_vard(NC *ncp, */ offset = 0; bufcount = 0; - nelems = 0; filetype_size = 0; filetype = MPI_BYTE; buftype = MPI_BYTE; @@ -305,31 +303,79 @@ getput_vard(NC *ncp, } status = err; + /* set the MPI-IO fileview, this is a collective call */ +#if 1 + /* vard API is only supported when using MPI-IO, not PNCIO */ + char *mpi_name; + MPI_File fh; + /* when ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh */ - fh = ncp->independent_fh; - coll_indep = NC_REQ_INDEP; - if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) { - fh = ncp->collective_fh; - coll_indep = NC_REQ_COLL; - } + fh = (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP)) + ? ncp->collective_fh : ncp->independent_fh; - /* set the MPI-IO fileview, this is a collective call */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + TRACE_IO(MPI_File_set_view, (fh, offset, MPI_BYTE, filetype, "native", + MPI_INFO_NULL)); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (status == NC_NOERR) status = err; + } +#else + err = ncmpio_file_set_view(ncp, offset, filetype, 0, NULL, NULL); +#endif if (err != NC_NOERR) { if (status == NC_NOERR) status = err; - nelems = 0; /* skip this request */ + filetype_size = 0; /* skip this request */ } - rw_flag = (fIsSet(reqMode, NC_REQ_RD)) ? 
NC_REQ_RD : NC_REQ_WR; +#if 1 + /* vard API is only supported when using MPI-IO, not PNCIO */ + int coll_indep = NC_REQ_INDEP; + if (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP)) + coll_indep = NC_REQ_COLL; + + PNCIO_View buf_view; + buf_view.type = MPI_BYTE; + buf_view.size = filetype_size; + buf_view.count = 1; + buf_view.is_contig = 1; + + if (fIsSet(reqMode, NC_REQ_RD)) { + MPI_Offset rlen; + + if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) + rlen = ncmpio_file_read_at_all(ncp, 0, xbuf, buf_view); + else + rlen = ncmpio_file_read_at_all(ncp, 0, xbuf, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; + } + else { + MPI_Offset wlen; - err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, nelems, - xtype, xbuf, xtype_is_contig); + if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) + wlen = ncmpio_file_write_at_all(ncp, 0, xbuf, buf_view); + else + wlen = ncmpio_file_write_at(ncp, 0, xbuf, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + } +#else + int rw_flag = (fIsSet(reqMode, NC_REQ_RD)) ? NC_REQ_RD : NC_REQ_WR; + + err = ncmpio_read_write(ncp, rw_flag, 0, nelems, xtype, xbuf); if (status == NC_NOERR) status = err; +#endif - /* No longer need to reset the file view, as the root's fileview includes - * the whole file header. 
- MPI_File_set_view(fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); - */ + /* reset fileview to make entire file visible */ +#if 1 + TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", + MPI_INFO_NULL)); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (status == NC_NOERR) status = err; + } +#else + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + if (status == NC_NOERR) status = err; +#endif if (fIsSet(reqMode, NC_REQ_RD)) { if (filetype_size == 0) return status; diff --git a/src/drivers/ncmpio/ncmpio_wait.c b/src/drivers/ncmpio/ncmpio_wait.c index fc635acfac..236499db10 100644 --- a/src/drivers/ncmpio/ncmpio_wait.c +++ b/src/drivers/ncmpio/ncmpio_wait.c @@ -34,71 +34,6 @@ NetCDF XDR Level xbuf (XDR I/O buffer) */ -/* Prototypes for functions used only in this file */ -static int wait_getput(NC *ncp, int num_reqs, NC_req *reqs, int rw_flag, - int coll_indep, MPI_Offset newnumrecs); - -static int mgetput(NC *ncp, int num_reqs, NC_req *reqs, int rw_flag, - int coll_indep); - -/*----< ncmpio_getput_zero_req() >-------------------------------------------*/ -/* This function is called when this process has zero-length I/O request and - * must participate all the MPI collective calls involved in the collective - * APIs and wait_all(), which include setting fileview, collective read/write, - * another setting fileview. - * - * This function is collective. 
- */ -int -ncmpio_getput_zero_req(NC *ncp, int reqMode) -{ - char *mpi_name; - int err, mpireturn, status=NC_NOERR; - MPI_Status mpistatus; - MPI_File fh; - - /* do nothing if this came from an independent API */ - if (fIsSet(reqMode, NC_REQ_INDEP)) return NC_NOERR; - - fh = ncp->collective_fh; - - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL)); - - if (fIsSet(reqMode, NC_REQ_RD)) { - if (ncp->nprocs > 1) { - TRACE_IO(MPI_File_read_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - } - else { - TRACE_IO(MPI_File_read_at, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - err = (err == NC_EFILE) ? NC_EREAD : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } else { /* write request */ - if (ncp->nprocs > 1) { - TRACE_IO(MPI_File_write_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - } - else { - TRACE_IO(MPI_File_write_at, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } - - /* No longer need to reset the file view, as the root's fileview includes - * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); - */ - - return status; -} - /*----< abuf_coalesce() >----------------------------------------------------*/ /* this function should be called after all bput requests have been served */ static int @@ -332,389 +267,115 @@ ncmpio_cancel(void *ncdp, return status; } -/*----< construct_filetypes() >----------------------------------------------*/ -/* concatenate the requests into a single MPI derived filetype */ +/*----< extract_reqs() >-----------------------------------------------------*/ +/* extract requests from the queues into new queues to be committed. 
+ * Input value of num_reqs can be NC_REQ_ALL, NC_GET_REQ_ALL, or NC_PUT_REQ_ALL + */ static int -construct_filetypes(NC *ncp, - NC_lead_req *lead_list, /* NC_REQ_WR or NC_REQ_RD */ - int num_reqs, -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens, /* [num_reqs] temp buffer */ - MPI_Count *disps, /* [num_reqs] temp buffer */ -#else - int *blocklens, /* [num_reqs] temp buffer */ - MPI_Aint *disps, /* [num_reqs] temp buffer */ -#endif - NC_req *reqs, /* [num_reqs] */ - MPI_Datatype *filetype) /* OUT */ +extract_reqs(NC *ncp, + int num_reqs, + int *req_ids, /* IN: [num_reqs] or NULL */ + int *statuses, /* IN: [num_reqs] or NULL */ + int *num_r_lead_reqs, /* OUT: no. lead get requests */ + int *num_r_reqs, /* OUT: no. non-lead get requests */ + NC_req **get_list, /* OUT: extracted get requests */ + int *num_w_lead_reqs, /* OUT: no. lead put requests */ + int *num_w_reqs, /* OUT: no. non-lead put requests */ + NC_req **put_list) /* OUT: extracted put requests */ { - int i, j, err, status=NC_NOERR, all_ftype_contig=1, last_contig_req; - int mpireturn; - MPI_Datatype *ftypes; - - if (num_reqs <= 0) { /* for participating collective call */ - *filetype = MPI_BYTE; - return NC_NOERR;; - } + int i, j, status=NC_NOERR; + NC_req *put_list_ptr, *get_list_ptr; - /* hereinafter, num_reqs > 0 */ - ftypes = (MPI_Datatype*) NCI_Malloc(sizeof(MPI_Datatype) * num_reqs); + *num_r_lead_reqs = 0; + *num_w_lead_reqs = 0; + *num_r_reqs = 0; + *num_w_reqs = 0; - /* create a filetype for each request */ - last_contig_req = -1; /* index of the last contiguous request */ - j = 0; /* index of last valid ftypes */ - for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - lead = lead_list + reqs[i].lead_off; - ndims = lead->varp->ndims; + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; + } + if (num_reqs == NC_GET_REQ_ALL || num_reqs == NC_REQ_ALL) { + /* the entire 
get requests */ + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - ftypes[j] = MPI_BYTE; /* in case the call below failed */ + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + } + if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL || + num_reqs == NC_PUT_REQ_ALL) + return NC_NOERR; - if (ndims == 0) { /* scalar variable */ -#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET - if (lead->varp->begin > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - fSet(lead->flag, NC_REQ_SKIP); /* skip this request */ - if ( lead->status != NULL && - *lead->status == NC_NOERR) - *lead->status = err; - if (status == NC_NOERR) - status = err; /* report first error */ + if (ncp->numGetReqs == 0 && num_reqs == ncp->numLeadPutReqs) { + /* this is the same as NC_PUT_REQ_ALL */ + for (i=0; inumLeadPutReqs; i++) { + ncp->put_lead_list[i].status = statuses + i; + statuses[i] = NC_NOERR; } -#endif - disps[j] = lead->varp->begin; - is_ftype_contig = 1; } - else { /* non-scalar variable */ - MPI_Offset offset, *count, *stride; - count = reqs[i].start + ndims; - stride = fIsSet(lead->flag, NC_REQ_STRIDE_NULL) ? 
- NULL : count + ndims; - - err = ncmpio_filetype_create_vars(ncp, - lead->varp, - reqs[i].start, - count, - stride, - &offset, - &ftypes[j], - &is_ftype_contig); - -#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET - if (err == NC_NOERR && offset > NC_MAX_INT) - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) -#endif - disps[j] = (MPI_Aint)offset; + for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - if (err != NC_NOERR) { - fSet(lead->flag, NC_REQ_SKIP); /* skip this request */ - if ( lead->status != NULL && - *lead->status == NC_NOERR) - *lead->status = err; - if (status == NC_NOERR) status = err; /* report first error */ - continue; + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; + return NC_NOERR; + } + if (ncp->numPutReqs == 0 && num_reqs == ncp->numLeadGetReqs) { + /* this is the same as NC_GET_REQ_ALL */ + for (i=0; inumLeadGetReqs; i++) { + ncp->get_lead_list[i].status = statuses + i; + statuses[i] = NC_NOERR; } } + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - if (is_ftype_contig) { - MPI_Offset coalesced_len; + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + return NC_NOERR; + } + if (num_reqs == ncp->numLeadPutReqs + ncp->numLeadGetReqs && + statuses == NULL) { + /* this is the same as NC_REQ_ALL */ + for (i=0; ivarp->xsz * reqs[i].nelems; + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; -#ifdef HAVE_MPI_LARGE_COUNT - blocklens[j] = coalesced_len; -#else - if (coalesced_len > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - if (status == NC_NOERR) - status = err; /* report first error */ - 
coalesced_len = 0; - } - blocklens[j] = (int)coalesced_len; -#endif - if (last_contig_req >= 0) - coalesced_len += blocklens[last_contig_req]; -#ifdef HAVE_MPI_LARGE_COUNT - if (last_contig_req >= 0 && - disps[j] - disps[last_contig_req] == - blocklens[last_contig_req]) { - blocklens[last_contig_req] = coalesced_len; - j--; - } - else last_contig_req = j; -#else - /* if coalesced_len overflows 4-byte int, then skip coalescing */ - if (coalesced_len < NC_MAX_INT && last_contig_req >= 0 && - disps[j] - disps[last_contig_req] == - blocklens[last_contig_req]) { - blocklens[last_contig_req] = (int)coalesced_len; - j--; - } - else last_contig_req = j; -#endif - } - else { - /* we will construct a filetype, set blocklen to 1 */ - blocklens[j] = 1; - last_contig_req = -1; - all_ftype_contig = 0; - } - } - /* j is the new num_reqs */ - num_reqs = j; - - if (status != NC_NOERR) { - /* even if error occurs, we still must participate the collective - call to MPI_File_set_view() */ - *filetype = MPI_BYTE; - } - else if (num_reqs == 1 && disps[0] == 0) { - if (ftypes[0] == MPI_BYTE) - *filetype = MPI_BYTE; - else { - mpireturn = MPI_Type_dup(ftypes[0], filetype); - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_dup"); - } - } - else { /* if (num_reqs > 1 || (num_reqs == 1 && disps[0] > 0)) */ - /* all ftypes[] created fine, now concatenate all ftypes[] */ - if (all_ftype_contig) { -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, - MPI_BYTE, filetype); -#else - mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, - MPI_BYTE, filetype); -#endif - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); - else { - MPI_Type_commit(filetype); - err = NC_NOERR; - } - } - else { -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_struct_c(num_reqs, blocklens, disps, - ftypes, filetype); -#else - mpireturn = MPI_Type_create_struct(num_reqs, 
blocklens, disps, - ftypes, filetype); -#endif - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct"); - else { - MPI_Type_commit(filetype); - err = NC_NOERR; - } - } - - if (err != NC_NOERR) *filetype = MPI_BYTE; - if (status == NC_NOERR) status = err; /* report the first error */ - } - - for (i=0; i--------------------------------------------*/ -/* the input requests, reqs[], are non-interleaving requests */ -static int -construct_buffertypes(NC_lead_req *lead_list, - int num_reqs, -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens, /* [num_reqs] temp buffer */ - MPI_Count *disps, /* [num_reqs] temp buffer */ -#else - int *blocklens, /* [num_reqs] temp buffer */ - MPI_Aint *disps, /* [num_reqs] temp buffer */ -#endif - NC_req *reqs, /* [num_reqs] */ - MPI_Datatype *buf_type) /* OUT */ -{ - int i, j, k, status=NC_NOERR, mpireturn; - MPI_Aint a0, ai; - - *buf_type = MPI_BYTE; - if (num_reqs == 0) return NC_NOERR; - - /* create the I/O buffer derived data type */ - - /* calculate blocklens[], and disps[] */ - for (i=0, j=0; iflag, NC_REQ_SKIP)) continue; - - req_size = lead->varp->xsz; - if (lead->varp->ndims > 0) { /* non-scalar variable */ - MPI_Offset *count = reqs[i].start + lead->varp->ndims; - if (!IS_RECVAR(lead->varp)) req_size *= count[0]; - for (k=1; kvarp->ndims; k++) req_size *= count[k]; - } - -#ifdef HAVE_MPI_LARGE_COUNT - blocklens[j] = req_size; -#else - /* check int overflow */ - if (req_size > NC_MAX_INT) { /* skip this request */ - fSet(lead->flag, NC_REQ_SKIP); - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - continue; - } - blocklens[j] = (int)req_size; -#endif - - MPI_Get_address(reqs[i].xbuf, &ai); - if (j == 0) a0 = ai; - disps[j] = MPI_Aint_diff(ai, a0); - j++; - } - /* update num_reqs to number of valid requests */ - num_reqs = j; - - if (num_reqs > 0) { - /* concatenate buffer addresses into a single buffer type */ -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_hindexed_c(num_reqs, 
blocklens, disps, - MPI_BYTE, buf_type); -#else - mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, - MPI_BYTE, buf_type); -#endif - if (mpireturn != MPI_SUCCESS) { - int err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; - } - else - MPI_Type_commit(buf_type); - } - - return status; -} - -/*----< extract_reqs() >-----------------------------------------------------*/ -/* extract requests from the queues into new queues to be committed. - * Input value of num_reqs can be NC_REQ_ALL, NC_GET_REQ_ALL, or NC_PUT_REQ_ALL - */ -static int -extract_reqs(NC *ncp, - int num_reqs, - int *req_ids, /* IN: [num_reqs] or NULL */ - int *statuses, /* IN: [num_reqs] or NULL */ - int *num_r_lead_reqs, /* OUT: no. lead get requests */ - int *num_r_reqs, /* OUT: no. non-lead get requests */ - NC_req **get_list, /* OUT: extracted get requests */ - int *num_w_lead_reqs, /* OUT: no. lead put requests */ - int *num_w_reqs, /* OUT: no. 
non-lead put requests */ - NC_req **put_list) /* OUT: extracted put requests */ -{ - int i, j, status=NC_NOERR; - NC_req *put_list_ptr, *get_list_ptr; - - *num_r_lead_reqs = 0; - *num_w_lead_reqs = 0; - *num_r_reqs = 0; - *num_w_reqs = 0; - - if (num_reqs == NC_PUT_REQ_ALL || num_reqs == NC_REQ_ALL) { - /* the entire put requests */ - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - } - if (num_reqs == NC_GET_REQ_ALL || num_reqs == NC_REQ_ALL) { - /* the entire get requests */ - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - } - if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL || - num_reqs == NC_PUT_REQ_ALL) - return NC_NOERR; - - if (ncp->numGetReqs == 0 && num_reqs == ncp->numLeadPutReqs) { - /* this is the same as NC_PUT_REQ_ALL */ - for (i=0; inumLeadPutReqs; i++) { - ncp->put_lead_list[i].status = statuses + i; - statuses[i] = NC_NOERR; - } - } - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - return NC_NOERR; - } - if (ncp->numPutReqs == 0 && num_reqs == ncp->numLeadGetReqs) { - /* this is the same as NC_GET_REQ_ALL */ - for (i=0; inumLeadGetReqs; i++) { - ncp->get_lead_list[i].status = statuses + i; - statuses[i] = NC_NOERR; - } - } - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - return NC_NOERR; - 
} - if (num_reqs == ncp->numLeadPutReqs + ncp->numLeadGetReqs && - statuses == NULL) { - /* this is the same as NC_REQ_ALL */ - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - return NC_NOERR; + for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + return NC_NOERR; } /* requests are a subset of pending requests */ @@ -797,7 +458,7 @@ extract_reqs(NC *ncp, req_ids[i] == ncp->put_lead_list[j].id) { memcpy(put_list_ptr, ncp->put_list + ncp->put_lead_list[j].nonlead_off, - ncp->put_lead_list[j].nonlead_num * sizeof(NC_req)); + sizeof(NC_req) * ncp->put_lead_list[j].nonlead_num); put_list_ptr += ncp->put_lead_list[j].nonlead_num; req_ids[i] = NC_REQ_NULL; break; @@ -810,7 +471,7 @@ extract_reqs(NC *ncp, req_ids[i] == ncp->get_lead_list[j].id) { memcpy(get_list_ptr, ncp->get_list + ncp->get_lead_list[j].nonlead_off, - ncp->get_lead_list[j].nonlead_num * sizeof(NC_req)); + sizeof(NC_req) * ncp->get_lead_list[j].nonlead_num); get_list_ptr += ncp->get_lead_list[j].nonlead_num; req_ids[i] = NC_REQ_NULL; break; @@ -987,30 +648,72 @@ req_commit(NC *ncp, do_write = (num_w_reqs > 0); } +#if 1 /* carry out writes and reads separately (writes first) */ - if (do_write > 0) { + err = ncmpio_ina_nreqs(ncp, NC_REQ_WR, num_w_reqs, put_list, + newnumrecs); + put_list = NULL; /* has been freed in the above call */ + + /* Update the number of records if new records have been created. 
+ * For nonblocking APIs, there is no way for a process to know whether + * others write to a record variable or not. Note newnumrecs has been + * sync-ed and always >= ncp->numrecs. + */ + if (coll_indep == NC_REQ_COLL) { + if (newnumrecs > ncp->numrecs) { + /* update new record number in file. Note newnumrecs is already + * sync-ed among all processes and in collective mode + * ncp->numrecs is always sync-ed in memory among processes, + * thus no need another MPI_Allreduce to sync it. */ + err = ncmpio_write_numrecs(ncp, newnumrecs); + if (status == NC_NOERR) status = err; + /* retain the first error if there is any */ + if (ncp->numrecs < newnumrecs) ncp->numrecs = newnumrecs; + } + } + else { /* NC_REQ_INDEP */ + if (ncp->numrecs < newnumrecs) { + ncp->numrecs = newnumrecs; + set_NC_ndirty(ncp); + /* delay numrecs sync until end_indep, redef or close */ + } + } + } + if (do_read > 0) { + err = ncmpio_ina_nreqs(ncp, NC_REQ_RD, num_r_reqs, get_list, + newnumrecs); + get_list = NULL; /* has been freed in the above call */ + } +#else + if (do_write > 0) { + if (ncp->num_aggrs_per_node > 0 && coll_indep == NC_REQ_COLL) + /* intra-node aggregation must be in collective mode */ + err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_WR, + num_w_reqs, put_list, + newnumrecs); + else + err = wait_getput(ncp, num_w_reqs, put_list, NC_REQ_WR, coll_indep, + newnumrecs); + put_list = NULL; /* has been freed in wait_getput() */ + } + + if (do_read > 0) { + if (ncp->num_aggrs_per_node > 0 && coll_indep == NC_REQ_COLL) + /* intra-node aggregation must be in collective mode */ + err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_RD, + num_r_reqs, get_list, + newnumrecs); + else + err = wait_getput(ncp, num_r_reqs, get_list, NC_REQ_RD, coll_indep, + newnumrecs); + get_list = NULL; /* has been freed in wait_getput() */ + } +#endif + + /* retain the first error status */ + if (status == NC_NOERR) status = err; - if (ncp->my_aggr >= 0 && coll_indep == NC_REQ_COLL && ncp->nprocs 
> 1) - /* intra-node write aggregation must be in collective mode */ - err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_WR, - num_w_reqs, put_list, - newnumrecs); - else - err = wait_getput(ncp, num_w_reqs, put_list, NC_REQ_WR, coll_indep, - newnumrecs); - put_list = NULL; /* has been freed in wait_getput() */ - } - - if (do_read > 0) { - err = wait_getput(ncp, num_r_reqs, get_list, NC_REQ_RD, coll_indep, - newnumrecs); - get_list = NULL; /* has been freed in wait_getput() */ - } - - /* retain the first error status */ - if (status == NC_NOERR) status = err; - /* post-IO data processing: In write case, we may need to byte-swap user * write buf if it is used as the write buffer in MPI write call and the * target machine is little Endian. For read case, we may need to @@ -1114,137 +817,424 @@ req_commit(NC *ncp, j++; } } - ncp->numLeadGetReqs = j; - if (ncp->numLeadGetReqs == 0) { - NCI_Free(ncp->get_list); - NCI_Free(ncp->get_lead_list); - ncp->get_list = NULL; - ncp->get_lead_list = NULL; - } + ncp->numLeadGetReqs = j; + if (ncp->numLeadGetReqs == 0) { + NCI_Free(ncp->get_list); + NCI_Free(ncp->get_lead_list); + ncp->get_list = NULL; + ncp->get_lead_list = NULL; + } + } + + return status; +} + +/*----< ncmpio_wait() >-------------------------------------------------------*/ +int +ncmpio_wait(void *ncdp, + int num_reqs, + int *req_ids, /* [num_reqs]: IN/OUT */ + int *statuses, /* [num_reqs] */ + int reqMode) /* only check if NC_REQ_COLL or NC_REQ_INDEP */ +{ + NC *ncp = (NC*)ncdp; + int coll_indep; + + if (NC_indef(ncp)) /* wait must be called in data mode */ + DEBUG_RETURN_ERROR(NC_EINDEFINE) + + coll_indep = (fIsSet(reqMode, NC_REQ_INDEP)) ? 
NC_REQ_INDEP : NC_REQ_COLL; + +#ifdef ENABLE_REQ_AGGREGATION + /* check collective or independent mode */ + if (coll_indep == NC_REQ_INDEP && !NC_indep(ncp)) + DEBUG_RETURN_ERROR(NC_ENOTINDEP) + else if (coll_indep == NC_REQ_COLL && NC_indep(ncp)) + DEBUG_RETURN_ERROR(NC_EINDEP) + + if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR; + + return req_commit(ncp, num_reqs, req_ids, statuses, coll_indep); +#else + /* If request aggregation is disabled, we call an independent wait() for + * each request + */ + int i, status=NC_NOERR, err; + + if (coll_indep == NC_REQ_INDEP) { + /* This is called from ncmpi_wait(), which is an independent call + * Argument num_reqs can be NC_REQ_ALL which means to flush all pending + * nonblocking requests. In this case, arguments req_ids and statuses + * will be ignored. + * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL, + * NC_PUT_REQ_ALL, or a non-negative value. + * Argument statuses can be NULL, meaning the caller only cares about + * the error code returned by this call, but not the statuses of + * individual nonblocking requests. + */ + if (num_reqs == 0) return NC_NOERR; + + /* This is called from ncmpi_wait which must be called in independent + * data mode, illegal in collective mode. + */ + if (!NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_ENOTINDEP); + + if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR; + } + else { + /* This is called from ncmpi_wait_all(), which is a collective call + * Argument num_reqs can be NC_REQ_ALL which means to flush all pending + * nonblocking requests. In this case, arguments req_ids and statuses + * will be ignored. + * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL, + * NC_PUT_REQ_ALL, or a non-negative value. + * Argument statuses can be NULL, meaning the caller only cares about + * the error code returned by this call, but not the statuses of + * individual nonblocking requests. 
+ */ + /* the following line CANNOT be added, because ncmpi_wait_all() is a + * collective call, all processes must participate some MPI collective + * operations used later on. + */ + /* if (num_reqs == 0) return NC_NOERR; */ + + /* This is called from ncmpi_wait_all which must be called in + * collective data mode, illegal in independent mode. This also + * ensures the program will returns back to collective mode. + */ + if (NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_EINDEP); + + /* must enter independent mode, as num_reqs may be different among + processes */ + err = ncmpio_begin_indep_data(ncp); + if (status == NC_NOERR) status = err; + } + + if (num_reqs <= NC_REQ_ALL) { /* flush all get or put pending requests */ + if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL) { + while (ncp->numLeadGetReqs) { + /* commit one request at a time. Note ncp->numLeadGetReqs + * will be descreased in req_commit() + */ + err = req_commit(ncp, 1, &ncp->get_lead_list[0].id, NULL, + NC_REQ_INDEP); + if (status == NC_NOERR) status = err; + } + } + if (num_reqs == NC_REQ_ALL || num_reqs == NC_PUT_REQ_ALL) { + while (ncp->numLeadPutReqs) { + /* commit one request at a time. 
Note ncp->numLeadPutReqs + * will be descreased in req_commit() + */ + err = req_commit(ncp, 1, &ncp->put_lead_list[0].id, NULL, + NC_REQ_INDEP); + if (status == NC_NOERR) status = err; + } + } + } + else { + for (i=0; i----------------------------------------------*/ +/* concatenate the requests into a single MPI derived filetype */ +static int +construct_filetypes(NC *ncp, + NC_lead_req *lead_list, /* NC_REQ_WR or NC_REQ_RD */ + int num_reqs, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *blocklens, /* [num_reqs] temp buffer */ + MPI_Count *disps, /* [num_reqs] temp buffer */ +#else + int *blocklens, /* [num_reqs] temp buffer */ + MPI_Aint *disps, /* [num_reqs] temp buffer */ +#endif + NC_req *reqs, /* [num_reqs] */ + MPI_Datatype *filetype) /* OUT */ +{ + int i, j, err, status=NC_NOERR, all_ftype_contig=1, last_contig_req; + int mpireturn; + MPI_Datatype *ftypes; + + if (num_reqs <= 0) { /* for participating collective call */ + *filetype = MPI_BYTE; + return NC_NOERR;; + } + + /* hereinafter, num_reqs > 0 */ + ftypes = (MPI_Datatype*) NCI_Malloc(sizeof(MPI_Datatype) * num_reqs); + + /* create a filetype for each request */ + last_contig_req = -1; /* index of the last contiguous request */ + j = 0; /* index of last valid ftypes */ + for (i=0; ivarp->ndims; + + ftypes[j] = MPI_BYTE; /* in case the call below failed */ + + if (ndims == 0) { /* scalar variable */ +#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET + if (lead->varp->begin > NC_MAX_INT) { + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + fSet(lead->flag, NC_REQ_SKIP); /* skip this request */ + if ( lead->status != NULL && + *lead->status == NC_NOERR) + *lead->status = err; + if (status == NC_NOERR) + status = err; /* report first error */ + } +#endif + disps[j] = lead->varp->begin; + is_ftype_contig = 1; + } + else if (reqs[i].npairs == 1) { /* only one offset-length pair */ + /* reqs[i].offset_start has been set back in wait_getput() */ + disps[j] = reqs[i].offset_start; + is_ftype_contig = 1; + } + else { /* non-scalar 
variable with more offset-length pairs */ + MPI_Offset offset, *count, *stride; + count = reqs[i].start + ndims; + stride = fIsSet(lead->flag, NC_REQ_STRIDE_NULL) ? + NULL : count + ndims; + + err = ncmpio_filetype_create_vars(ncp, + lead->varp, + reqs[i].start, + count, + stride, + &offset, + &ftypes[j], + &is_ftype_contig); + +#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET + if (err == NC_NOERR && offset > NC_MAX_INT) + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) +#endif + disps[j] = (MPI_Aint)offset; + + if (err != NC_NOERR) { + fSet(lead->flag, NC_REQ_SKIP); /* skip this request */ + if ( lead->status != NULL && + *lead->status == NC_NOERR) + *lead->status = err; + if (status == NC_NOERR) status = err; /* report first error */ + continue; + } + } + + if (is_ftype_contig) { + MPI_Offset coalesced_len; + + /* No need to construct a filetype */ + coalesced_len = lead->varp->xsz * reqs[i].nelems; + +#ifdef HAVE_MPI_LARGE_COUNT + blocklens[j] = coalesced_len; +#else + if (coalesced_len > NC_MAX_INT) { + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + if (status == NC_NOERR) + status = err; /* report first error */ + coalesced_len = 0; + } + blocklens[j] = (int)coalesced_len; +#endif + if (last_contig_req >= 0) + coalesced_len += blocklens[last_contig_req]; +#ifdef HAVE_MPI_LARGE_COUNT + if (last_contig_req >= 0 && + disps[j] - disps[last_contig_req] == + blocklens[last_contig_req]) { + blocklens[last_contig_req] = coalesced_len; + j--; + } + else last_contig_req = j; +#else + /* if coalesced_len overflows 4-byte int, then skip coalescing */ + if (coalesced_len < NC_MAX_INT && last_contig_req >= 0 && + disps[j] - disps[last_contig_req] == + blocklens[last_contig_req]) { + blocklens[last_contig_req] = (int)coalesced_len; + j--; + } + else last_contig_req = j; +#endif + } + else { + /* we will construct a filetype, set blocklen to 1 */ + blocklens[j] = 1; + last_contig_req = -1; + all_ftype_contig = 0; + } + } + /* j is the new num_reqs */ + num_reqs = j; + + if (status != NC_NOERR) 
{ + /* even if error occurs, we still must participate the collective + call to MPI_File_set_view() */ + *filetype = MPI_BYTE; + } + else if (num_reqs == 1 && disps[0] == 0) { + if (ftypes[0] == MPI_BYTE) + *filetype = MPI_BYTE; + else { + mpireturn = MPI_Type_dup(ftypes[0], filetype); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_dup"); + } + } + else { /* if (num_reqs > 1 || (num_reqs == 1 && disps[0] > 0)) */ + /* all ftypes[] created fine, now concatenate all ftypes[] */ + if (all_ftype_contig) { +#ifdef HAVE_MPI_LARGE_COUNT + mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, + MPI_BYTE, filetype); +#else + mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, + MPI_BYTE, filetype); +#endif + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); + else { + MPI_Type_commit(filetype); + err = NC_NOERR; + } + } + else { +#ifdef HAVE_MPI_LARGE_COUNT + mpireturn = MPI_Type_create_struct_c(num_reqs, blocklens, disps, + ftypes, filetype); +#else + mpireturn = MPI_Type_create_struct(num_reqs, blocklens, disps, + ftypes, filetype); +#endif + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct"); + else { + MPI_Type_commit(filetype); + err = NC_NOERR; + } + } + + if (err != NC_NOERR) *filetype = MPI_BYTE; + if (status == NC_NOERR) status = err; /* report the first error */ + } + + for (i=0; i-------------------------------------------------------*/ -int -ncmpio_wait(void *ncdp, - int num_reqs, - int *req_ids, /* [num_reqs]: IN/OUT */ - int *statuses, /* [num_reqs] */ - int reqMode) /* only check if NC_REQ_COLL or NC_REQ_INDEP */ +/*----< construct_buffertypes() >--------------------------------------------*/ +/* the input requests, reqs[], are non-interleaving requests */ +static int +construct_buffertypes(NC_lead_req *lead_list, + int num_reqs, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *blocklens, /* [num_reqs] temp buffer */ 
+ MPI_Count *disps, /* [num_reqs] temp buffer */ +#else + int *blocklens, /* [num_reqs] temp buffer */ + MPI_Aint *disps, /* [num_reqs] temp buffer */ +#endif + NC_req *reqs, /* [num_reqs] */ + MPI_Datatype *buf_type) /* OUT */ { - NC *ncp = (NC*)ncdp; - int coll_indep; - - if (NC_indef(ncp)) /* wait must be called in data mode */ - DEBUG_RETURN_ERROR(NC_EINDEFINE) - - coll_indep = (fIsSet(reqMode, NC_REQ_INDEP)) ? NC_REQ_INDEP : NC_REQ_COLL; + int i, j, k, status=NC_NOERR, mpireturn; + MPI_Aint a0, ai; -#ifdef ENABLE_REQ_AGGREGATION - /* check collective or independent mode */ - if (coll_indep == NC_REQ_INDEP && !NC_indep(ncp)) - DEBUG_RETURN_ERROR(NC_ENOTINDEP) - else if (coll_indep == NC_REQ_COLL && NC_indep(ncp)) - DEBUG_RETURN_ERROR(NC_EINDEP) + *buf_type = MPI_BYTE; + if (num_reqs == 0) return NC_NOERR; - if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR; + /* create the I/O buffer derived data type */ - return req_commit(ncp, num_reqs, req_ids, statuses, coll_indep); -#else - /* If request aggregation is disabled, we call an independent wait() for - * each request - */ - int i, status=NC_NOERR, err; + /* calculate blocklens[], and disps[] */ + for (i=0, j=0; iflag, NC_REQ_SKIP)) continue; - if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR; - } - else { - /* This is called from ncmpi_wait_all(), which is a collective call - * Argument num_reqs can be NC_REQ_ALL which means to flush all pending - * nonblocking requests. In this case, arguments req_ids and statuses - * will be ignored. - * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL, - * NC_PUT_REQ_ALL, or a non-negative value. - * Argument statuses can be NULL, meaning the caller only cares about - * the error code returned by this call, but not the statuses of - * individual nonblocking requests. 
- */ - /* the following line CANNOT be added, because ncmpi_wait_all() is a - * collective call, all processes must participate some MPI collective - * operations used later on. - */ - /* if (num_reqs == 0) return NC_NOERR; */ + req_size = lead->varp->xsz; + if (lead->varp->ndims > 0) { /* non-scalar variable */ + MPI_Offset *count = reqs[i].start + lead->varp->ndims; + if (!IS_RECVAR(lead->varp)) req_size *= count[0]; + for (k=1; kvarp->ndims; k++) req_size *= count[k]; + } - /* This is called from ncmpi_wait_all which must be called in - * collective data mode, illegal in independent mode. This also - * ensures the program will returns back to collective mode. - */ - if (NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_EINDEP); +#ifdef HAVE_MPI_LARGE_COUNT + blocklens[j] = req_size; +#else + /* check int overflow */ + if (req_size > NC_MAX_INT) { /* skip this request */ + fSet(lead->flag, NC_REQ_SKIP); + DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) + continue; + } + blocklens[j] = (int)req_size; +#endif - /* must enter independent mode, as num_reqs may be different among - processes */ - err = ncmpio_begin_indep_data(ncp); - if (status == NC_NOERR) status = err; + MPI_Get_address(reqs[i].xbuf, &ai); + if (j == 0) a0 = ai; + disps[j] = MPI_Aint_diff(ai, a0); + j++; } + /* update num_reqs to number of valid requests */ + num_reqs = j; - if (num_reqs <= NC_REQ_ALL) { /* flush all get or put pending requests */ - if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL) { - while (ncp->numLeadGetReqs) { - /* commit one request at a time. Note ncp->numLeadGetReqs - * will be descreased in req_commit() - */ - err = req_commit(ncp, 1, &ncp->get_lead_list[0].id, NULL, - NC_REQ_INDEP); - if (status == NC_NOERR) status = err; - } - } - if (num_reqs == NC_REQ_ALL || num_reqs == NC_PUT_REQ_ALL) { - while (ncp->numLeadPutReqs) { - /* commit one request at a time. 
Note ncp->numLeadPutReqs - * will be descreased in req_commit() - */ - err = req_commit(ncp, 1, &ncp->put_lead_list[0].id, NULL, - NC_REQ_INDEP); - if (status == NC_NOERR) status = err; - } - } - } - else { - for (i=0; i 0) { + /* concatenate buffer addresses into a single buffer type */ +#ifdef HAVE_MPI_LARGE_COUNT + mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, + MPI_BYTE, buf_type); +#else + mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, + MPI_BYTE, buf_type); +#endif + if (mpireturn != MPI_SUCCESS) { + int err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); + /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } + else + MPI_Type_commit(buf_type); } - if (coll_indep == NC_REQ_COLL) { - /* return to collective data mode */ - err = ncmpio_end_indep_data(ncp); - if (status == NC_NOERR) status = err; - } - - return status; /* return the first error encountered, if there is any */ -#endif + return status; } /* C struct for breaking down a request to a list of offset-length segments */ @@ -1381,8 +1371,8 @@ merge_requests(NC *ncp, MPI_Offset *nsegs, /* OUT: no. 
off-len pairs */ off_len **segs) /* OUT: [*nsegs] */ { - int i, j, status=NC_NOERR, ndims; - MPI_Offset nseg, *start, *count, *shape, *stride; + int i, j, status=NC_NOERR, ndims, is_incr; + MPI_Offset nseg, *start, *count, *shape, *stride, prev_offset; MPI_Aint addr, buf_addr; *nsegs = 0; /* total number of offset-length pairs */ @@ -1397,43 +1387,18 @@ merge_requests(NC *ncp, /* Count the number off-len pairs from reqs[], so we can malloc a * contiguous memory space for storing off-len pairs */ - for (i=0; ivarp->ndims; - if (ndims > 0) { - start = reqs[i].start; - count = start + ndims; - stride = count + ndims; - } - else - start = count = stride = NULL; - - /* for record variable, each reqs[] is within a record */ - if (IS_RECVAR(lead->varp)) { - ndims--; - start++; - count++; - stride++; - } - if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - - if (ndims < 0) continue; - if (ndims == 0) { /* 1D record variable */ - (*nsegs)++; - continue; - } - nseg = 1; - if (stride != NULL && stride[ndims-1] > 1) - nseg = count[ndims-1]; /* count of last dimension */ - for (j=0; joff = reqs[i].offset_start; + seg_ptr->len = reqs[i].nelems * lead->varp->xsz; + seg_ptr->buf_addr = addr; + if (prev_offset > seg_ptr->off) + is_incr = 0; /* offsets are not incrementing */ + else + prev_offset = seg_ptr->off; + seg_ptr++; + continue; + } + ndims = lead->varp->ndims; if (ndims > 0) { start = reqs[i].start; @@ -1476,15 +1454,18 @@ merge_requests(NC *ncp, addr, start, count, stride, &nseg, /* OUT: number of offset-length pairs */ seg_ptr); /* OUT: array of offset-length pairs */ + + /* check if (*segs)[].off are in an increasing order */ + for (j=0; j seg_ptr[j].off) + is_incr = 0; /* offsets are not incrementing */ + else + prev_offset = seg_ptr[j].off; + } seg_ptr += nseg; /* append the list to the end of segs array */ } - /* check if (*segs)[].off are in an increasing order */ - for (i=1; i<*nsegs; i++) { - if ((*segs)[i-1].off > (*segs)[i].off) - break; - } - if (i < 
*nsegs) /* not in an increasing order */ + if (!is_incr) /* not in an increasing order */ /* sort the off-len array, segs[], in an increasing order */ qsort(*segs, (size_t)(*nsegs), sizeof(off_len), off_compare); @@ -1751,8 +1732,7 @@ req_aggregation(NC *ncp, void *buf; /* point to starting buffer, used by MPI-IO call */ MPI_Aint b_begin, b_addr; MPI_Datatype filetype, buf_type, *ftypes, *btypes; - MPI_File fh; - MPI_Offset max_end, offset; + MPI_Offset max_end; if (num_reqs == 0) { /* only NC_REQ_COLL can reach here for 0 request */ assert(coll_indep == NC_REQ_COLL); @@ -2064,13 +2044,8 @@ req_aggregation(NC *ncp, } NCI_Free(reqs); - fh = ncp->independent_fh; - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - /* set the MPI-IO fileview, this is a collective call */ - offset = 0; - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, 0, filetype, 0, NULL, NULL); if (filetype != MPI_BYTE) MPI_Type_free(&filetype); if (err != NC_NOERR) { if (status == NC_NOERR) status = err; @@ -2079,112 +2054,25 @@ req_aggregation(NC *ncp, } /* call MPI_File_read_at_all/MPI_File_write_at_all */ - err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, buf_len, buf_type, - buf, ((buf_type == MPI_BYTE) ? 1 : 0)); + // err = ncmpio_read_write(ncp, rw_flag, 0, buf_len, buf_type, buf); + +assert(0); +/* This subroutine is no longer used. + PNCIO_View buf_view; + err = ncmpio_read_write(ncp, rw_flag, 0, buf_view, buf); +*/ + if (status == NC_NOERR) status = err; if (buf_type != MPI_BYTE) MPI_Type_free(&buf_type); /* No longer need to reset the file view, as the root's fileview includes * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); */ return status; } -/*----< calculate_access_range() >-------------------------------------------*/ -/* Returns the file offsets of access range of this request: starting file - * offset and end offset (exclusive). 
- * Note zero-length request should never call this subroutine. - */ -static int -calculate_access_range(const NC *ncp, - const NC_var *varp, - const MPI_Offset *start, /* [varp->ndims] */ - const MPI_Offset *count, /* [varp->ndims] */ - const MPI_Offset *stride, /* [varp->ndims] */ - MPI_Offset *start_off, /* OUT: start offset */ - MPI_Offset *end_off) /* OUT: end offset */ -{ - int i, ndims = varp->ndims; /* number of dimensions of this variable */ - - /* - * varp->dsizes[] is computed from right to left product of shape - * For example, a 3D array of size 5x4x3 in C order, - * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3 - * For record variable: dsizes[0]=12 dsizes[1]=12 dsizes[2]=3 - */ - if (IS_RECVAR(varp)) { - *start_off = 0; - *end_off = 0; - if (stride == NULL) { - if (ndims > 1) { - /* least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1); - /* the remaining dimensions */ - for (i=ndims-2; i>0; i--) { - *start_off += start[i]*varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1))*varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - /* handle the unlimited, most significant dimension */ - *start_off += start[0] * ncp->recsize; - *end_off += (start[0]+(count[0]-1)) * ncp->recsize; - } - else { - if (ndims > 1) { - /* least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1]; - /* the remaining dimensions */ - for (i=ndims-2; i>0; i--) { - *start_off += start[i]*varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)*stride[i]) * - varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - /* handle the unlimited, most significant dimension */ - *start_off += start[0] * ncp->recsize; - *end_off += (start[0]+(count[0]-1)*stride[0]) * ncp->recsize; - } - } - else { - if (stride == NULL) { - /* first handle the least significant 
dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1] + (count[ndims-1]-1); - /* remaining dimensions till the most significant dimension */ - for (i=ndims-2; i>=0; i--) { - *start_off += start[i] * varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)) * varp->dsizes[i+1]; - } - } - else { - /* first handle the least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1]; - /* remaining dimensions till the most significant dimension */ - for (i=ndims-2; i>=0; i--) { - *start_off += start[i] * varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)*stride[i])*varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - } - *start_off += varp->begin; /* beginning file offset of this variable */ - *end_off += varp->begin + varp->xsz; - - return NC_NOERR; -} - /*----< wait_getput() >------------------------------------------------------*/ static int wait_getput(NC *ncp, @@ -2210,8 +2098,17 @@ wait_getput(NC *ncp, varp = lead->varp; if (varp->ndims == 0) { /* scalar variable */ - reqs[i].offset_start = varp->begin; - reqs[i].offset_end = varp->begin + varp->xsz; + reqs[i].offset_start += varp->begin; + reqs[i].offset_end += varp->begin; + } + else if (reqs[i].npairs == 1) { /* only one offset-length pair */ + /* reqs[i].offset_end == reqs[i].nelems * varp->xsz */ + MPI_Offset off = varp->begin; + + if (IS_RECVAR(varp)) off += reqs[i].start[0] * ncp->recsize; + + reqs[i].offset_start += off; + reqs[i].offset_end += off; } else { /* start/count/stride have been allocated in a contiguous array */ @@ -2221,8 +2118,8 @@ wait_getput(NC *ncp, count + varp->ndims; /* calculate access range of this request */ - calculate_access_range(ncp, varp, reqs[i].start, count, stride, - &reqs[i].offset_start, &reqs[i].offset_end); + ncmpio_calc_start_end(ncp, varp, reqs[i].start, count, stride, + &reqs[i].offset_start, &reqs[i].offset_end); } if (i > 0) { /* check 
if offset_start are in a monotonic nondecreasing order */ @@ -2304,8 +2201,7 @@ mgetput(NC *ncp, void *buf=NULL; NC_lead_req *lead_list; MPI_Datatype filetype, buf_type=MPI_BYTE; - MPI_Offset offset=0, buf_count=0; - MPI_File fh; + MPI_Offset buf_count=0; #ifdef HAVE_MPI_LARGE_COUNT MPI_Count *blocklens; @@ -2489,12 +2385,8 @@ mgetput(NC *ncp, mpi_io: NCI_Free(reqs); - fh = ncp->independent_fh; - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - /* set the MPI-IO fileview, this is a collective call */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, 0, filetype, 0, NULL, NULL); if (filetype != MPI_BYTE) MPI_Type_free(&filetype); if (err != NC_NOERR) { if (status == NC_NOERR) status = err; @@ -2503,17 +2395,19 @@ mgetput(NC *ncp, } /* call MPI_File_read_at_all/MPI_File_write_at_all */ - err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, buf_count, - buf_type, buf, ((buf_type == MPI_BYTE) ? 1 : 0)); + // err = ncmpio_read_write(ncp, rw_flag, 0, buf_count, buf_type, buf); + assert(0); + PNCIO_View buf_view; + err = ncmpio_read_write(ncp, rw_flag, 0, buf_view, buf); if (status == NC_NOERR) status = err; if (buf_type != MPI_BYTE) MPI_Type_free(&buf_type); /* No longer need to reset the file view, as the root's fileview includes * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); */ return status; } +#endif + diff --git a/src/drivers/pncio/Makefile.am b/src/drivers/pncio/Makefile.am new file mode 100644 index 0000000000..f5527c8d5f --- /dev/null +++ b/src/drivers/pncio/Makefile.am @@ -0,0 +1,51 @@ +# +# Copyright (C) 2025, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. 
+# +# @configure_input@ + +SUFFIXES = .a .o .c .m4 .h + +AM_CPPFLAGS = -I${top_srcdir}/src/include +AM_CPPFLAGS += -I${top_builddir}/src/include +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/include +AM_CPPFLAGS += -I${top_builddir}/src/drivers/include + +if PNETCDF_DEBUG + AM_CPPFLAGS += -DPNETCDF_DEBUG +endif + +noinst_LTLIBRARIES = libpncio.la + +H_SRCS = pncio.h + +C_SRCS = pncio_read.c \ + pncio_write.c \ + pncio_open.c \ + pncio_close.c \ + pncio_fstype.c \ + pncio_aggregate.c \ + pncio_read_str.c \ + pncio_read_coll.c \ + pncio_read_str_naive.c \ + pncio_write_coll.c \ + pncio_write_str.c \ + pncio_write_str_naive.c \ + pncio_utils.c \ + pncio_lustre_open.c \ + pncio_lustre_wrcoll.c \ + pncio_lustre_wrstr.c \ + pncio_lock.c \ + pncio_set_size.c \ + pncio_sync.c \ + pncio_delete.c \ + pncio_set_view.c \ + pncio_hints.c + + +libpncio_la_SOURCES = $(C_SRCS) $(H_SRCS) + +CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out + +tests-local: all + diff --git a/src/drivers/pncio/pncio.h b/src/drivers/pncio/pncio.h new file mode 100644 index 0000000000..267509d53c --- /dev/null +++ b/src/drivers/pncio/pncio.h @@ -0,0 +1,356 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifndef H_PNCIO +#define H_PNCIO + +#include +#include +#include +#include /* pwrite() */ + +#include +#include /* memcpy() */ +#include /* size_t */ +#include /* off_t */ +#include +#ifdef HAVE_LIMITS_H +#include +#endif +#ifdef HAVE_FCNTL_H +#include +#endif +#define FDTYPE int + +#include +#include +#include + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +#define NMEASURES 8 +#endif + +#define PNCIO_LOCKS 300 /* file system supports fcntl()-style locking */ +#define PNCIO_Feature(a, b) ((b == PNCIO_LOCKS) ? 
1 : 0) + +#if defined(F_SETLKW64) +#define PNCIO_UNLOCK(fd, offset, whence, len) \ + PNCIO_GEN_SetLock64(fd, F_SETLK, F_UNLCK, offset, whence, len) +#define PNCIO_WRITE_LOCK(fd, offset, whence, len) \ + PNCIO_GEN_SetLock64(fd, F_SETLKW, F_WRLCK, offset, whence, len) +#else +#define PNCIO_UNLOCK(fd, offset, whence, len) \ + PNCIO_GEN_SetLock(fd, F_SETLK, F_UNLCK, offset, whence, len) +#define PNCIO_WRITE_LOCK(fd, offset, whence, len) \ + PNCIO_GEN_SetLock(fd, F_SETLKW, F_WRLCK, offset, whence, len) +#endif + + +#define PNCIO_PERM 0666 /* file creation permission mask */ + +#define PNCIO_UFS 152 /* Unix file system */ +#define PNCIO_LUSTRE 163 /* Lustre */ +#define PNCIO_FSTYPE_MPIIO -1 /* Use MPI-IO */ +#define PNCIO_FSTYPE_CHECK 0 /* Use PnetCDF PNCIO drivers */ + +#define PNCIO_LUSTRE_MAX_OSTS 256 /* Maximum number of Lustre OSTs if hint + * striping_factor is not set by user. + */ + +#define PNCIO_CB_BUFFER_SIZE_DFLT "16777216" +#define PNCIO_IND_RD_BUFFER_SIZE_DFLT "4194304" +#define PNCIO_IND_WR_BUFFER_SIZE_DFLT "524288" +#define PNCIO_CB_CONFIG_LIST_DFLT "*:1" + +/* PNCIO_DS_WR_NPAIRS_LB is the lower bound of the total number of + * offset-length pairs over the non-aggregator senders to be received by an + * I/O aggregator to skip the potentially expensive heap-merge sort that + * determines whether or not data sieving write is necessary. + * PNCIO_DS_WR_NAGGRS_LB is the lower bound of the number of non-aggregators + * sending their offset-length pairs to an I/O aggregator. + * Both conditions must be met to skip the heap-merge sort. + * + * When data sieving is enabled, read-modify-write will perform at each round + * of two-phase I/O at each aggregator. The following describes whether + * detecting "holes" in a write region is necessary, depending on the data + * sieving hint, romio_ds_write, is set to enable/disable/automatic. + * + automatic - We need to check whether holes exist. If holes exist, the + * "read-modify" part must run. 
If not, "read-modify" can be skipped. + * + enable - "read-modify" part must perform, skip hole checking, and thus + * skip the heap-merge sort. + * + disable - "read-modify" part must skip, need not check holes, but must + * construct srt_off_len to merge all others_req[] into a single sorted + * list, which requires to call a heap-merge sort. This step is necessary + * because write data from all non-aggregators are received into the same + * write_buf, with a possibility of overlaps, and srt_off_len stores the + * coalesced offset-length pairs of individual non-contiguous write + * request and will be used to write them to the file. + * + * Heap-merge sort merges offset-length pairs received from all non-aggregators + * into a single list, which can be expensive. Its cost can be even larger than + * the cost of "read" in "read-modify-write". Below two constants are the lower + * bounds used to determine whether or not to perform such sorting, when data + * sieving is set to the automatic mode. 
+ */ +#define PNCIO_DS_WR_NPAIRS_LB 8192 +#define PNCIO_DS_WR_NAGGRS_LB 256 +#define DO_HEAP_MERGE(nrecv, npairs) ((nrecv) > PNCIO_DS_WR_NAGGRS_LB || (npairs) > PNCIO_DS_WR_NPAIRS_LB) + +#define PNCIO_TYPE_DECREASE 0x00000001 /* if not monotonic nondecreasing */ +#define PNCIO_TYPE_OVERLAP 0x00000002 /* if contains overlapping regions */ +#define PNCIO_TYPE_NEGATIVE 0x00000004 /* if one of displacements is negative */ + +#define PNCIO_HINT_AUTO -1 +#define PNCIO_HINT_DISABLE 0 +#define PNCIO_HINT_ENABLE 1 + +#define PNCIO_STRIPING_AUTO -1 +#define PNCIO_STRIPING_INHERIT 0 + +typedef struct { + int nc_striping; + int striping_factor; + int striping_unit; + int start_iodevice; + int cb_nodes; + int cb_buffer_size; + int ind_rd_buffer_size; + int ind_wr_buffer_size; + + int romio_cb_read; + int romio_cb_write; + int romio_ds_read; + int romio_ds_write; + int romio_no_indep_rw; + + /* Hints for Lustre file system */ + int lustre_overstriping_ratio; + + /* Hints set by PnetCDF internally */ + int lustre_num_osts; + int *ranklist; + +} PNCIO_Hints; + +typedef struct { + MPI_Datatype type; /* MPI derived datatype */ + MPI_Offset size; /* total size in bytes (sum of len[*]) */ + MPI_Count count; /* number of off-len pairs */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *off; /* [count] byte offsets */ + MPI_Offset *len; /* [count] block lengths in bytes */ +#else + MPI_Offset *off; /* [count] byte offsets */ + int *len; /* [count] block lengths in bytes */ +#endif + MPI_Count idx; /* index of off-len pairs consumed so far */ + MPI_Aint rem; /* remaining amount in the pair to be consumed */ + int is_contig; /* whether view of file or buffer is contiguous */ +} PNCIO_View; + +typedef struct { + MPI_Comm comm; /* communicator indicating who called open */ + const char *filename; + int file_system; /* type of file system */ + + int fd_sys; /* system file descriptor */ + PNCIO_node_ids node_ids;/* node IDs of each rank */ + int access_mode; /* Access mode (sequential, append, 
etc.), + * possibly modified to deal with + * data sieving or deferred open */ + + int is_open; /* no_indep_rw, 0: not open yet 1: is open */ + + int skip_read; /* whether to skip reads in read-modify-write */ + + MPI_Offset disp; /* file displacement */ + MPI_Datatype filetype; /* file type set in fileview */ + /* etype in fileview is always MPI_BYTE in PnetCDF */ + PNCIO_View flat_file; /* flattern filetype */ + + int atomicity; /* true=atomic, false=nonatomic */ + char *io_buf; /* two-phase buffer allocated out of i/o path */ + int is_agg; /* bool: if I am an aggregator */ + int my_cb_nodes_index; /* my index into fd->hints->ranklist[]. -1 if N/A */ + PNCIO_Hints *hints; /* structure containing fs-indep. info values */ + MPI_Info info; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double write_timing[NMEASURES]; + double read_timing[NMEASURES]; + MPI_Count write_counter[NMEASURES]; + MPI_Count read_counter[NMEASURES]; +#endif +} PNCIO_File; + +typedef struct { + MPI_Offset *offsets; /* array of offsets */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *lens; /* array of lengths */ + MPI_Count *mem_ptrs; /* array of pointers. 
used in the read/write phase to + * indicate where the data is stored in memory + * promoted to MPI_Count so we can construct types + * with _c versions + */ + MPI_Count count; /* size of above arrays */ +#else + int *lens; + MPI_Aint *mem_ptrs; + size_t count; +#endif + size_t curr; /* index of offsets/lens that is currently being processed */ +} PNCIO_Access; + +/*---- APIs -----------------------------------------------------------------*/ +extern +int PNCIO_FileSysType(const char *filename); + +extern +int PNCIO_File_open(MPI_Comm comm, const char *filename, int amode, + MPI_Info info, PNCIO_File *fh); + +extern +int PNCIO_File_close(PNCIO_File *fh); + +extern +int PNCIO_File_set_view(PNCIO_File *fh, MPI_Offset disp, MPI_Datatype filetype, + MPI_Aint npairs, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *offsets, MPI_Count *lengths +#else + MPI_Offset *offsets, int *lengths +#endif +); + +extern +int PNCIO_File_sync(PNCIO_File *fh); + +extern +int PNCIO_File_delete(const char *filename); + +extern +int PNCIO_File_set_size(PNCIO_File *fh, MPI_Offset size); + +extern +int PNCIO_File_get_size(PNCIO_File *fh, MPI_Offset *size); + +extern +int PNCIO_File_get_info(PNCIO_File *fh, MPI_Info *info_used); + +extern +int PNCIO_File_SetInfo(PNCIO_File *fh, MPI_Info users_info); + +/* PNC I/O APIs */ +extern +MPI_Offset PNCIO_File_write_at(PNCIO_File *fh, MPI_Offset offset, + const void *buf, PNCIO_View buf_view); +extern +MPI_Offset PNCIO_File_write_at_all(PNCIO_File *fh, MPI_Offset offset, + const void *buf, PNCIO_View buf_view); + +extern +MPI_Offset PNCIO_File_read_at(PNCIO_File *fh, MPI_Offset offset, void *buf, + PNCIO_View buf_view); +extern +MPI_Offset PNCIO_File_read_at_all(PNCIO_File *fh, MPI_Offset offset, void *buf, + PNCIO_View buf_view); + +extern +MPI_Offset PNCIO_WriteContig(PNCIO_File *fd, const void *buf, + MPI_Offset w_size, MPI_Offset offset); + +extern +MPI_Offset PNCIO_ReadContig(PNCIO_File *fd, void *buf, MPI_Offset r_size, + MPI_Offset offset); + +/* 
utility APIs */ +extern +void PNCIO_Calc_file_domains(MPI_Offset * st_offsets, + MPI_Offset *end_offsets, int nprocs, int nprocs_for_coll, + MPI_Offset *min_st_offset_ptr, MPI_Offset **fd_start_ptr, + MPI_Offset **fd_end_ptr, MPI_Offset *fd_size_ptr, + int striping_unit); + +extern +void PNCIO_Calc_my_req(PNCIO_File *fd, MPI_Offset min_st_offset, + const MPI_Offset *fd_end, MPI_Offset fd_size, + int nprocs, MPI_Count *count_my_req_procs_ptr, + MPI_Count **count_my_req_per_proc_ptr, + PNCIO_Access **my_req_ptr, MPI_Aint **buf_idx_ptr); + +extern +void PNCIO_Calc_others_req(PNCIO_File *fd, MPI_Count count_my_req_procs, + MPI_Count *count_my_req_per_proc, PNCIO_Access *my_req, + int nprocs, int myrank, MPI_Count *count_others_req_procs_ptr, + MPI_Count **count_others_req_per_proc_ptr, + PNCIO_Access **others_req_ptr); + +extern +void PNCIO_Free_my_req(MPI_Count *count_my_req_per_proc, + PNCIO_Access *my_req, MPI_Aint *buf_idx); + +extern +void PNCIO_Free_others_req(MPI_Count *count_others_req_per_proc, + PNCIO_Access *others_req); + + +extern +int PNCIO_Calc_aggregator(const PNCIO_File *fd, MPI_Offset off, MPI_Offset min_off, + MPI_Offset *len, MPI_Offset fd_size, const MPI_Offset *fd_end); + +extern +void PNCIO_Heap_merge(PNCIO_Access *others_req, MPI_Count *count, + MPI_Offset *srt_off, MPI_Count *srt_len, MPI_Count *start_pos, + int nprocs, int nprocs_recv, MPI_Count total_elements); + +/* Generic APIs */ +extern +int PNCIO_GEN_SetLock(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, + int whence, MPI_Offset len); + +extern +int PNCIO_GEN_SetLock64(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, + int whence, MPI_Offset len); + +extern +MPI_Offset PNCIO_GEN_WriteStrided(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStrided_naive(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStridedColl(PNCIO_File *fd, void *buf, + PNCIO_View 
buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_WriteStrided_naive(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStrided(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_WriteStridedColl(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +/* Lustre */ +extern +int PNCIO_Lustre_create(PNCIO_File *fd, int access_mode); + +extern +int PNCIO_Lustre_open(PNCIO_File *fd); + +extern +MPI_Offset PNCIO_LUSTRE_WriteStrided(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_LUSTRE_WriteStridedColl(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +#endif diff --git a/src/drivers/pncio/pncio_aggregate.c b/src/drivers/pncio/pncio_aggregate.c new file mode 100644 index 0000000000..01542f627a --- /dev/null +++ b/src/drivers/pncio/pncio_aggregate.c @@ -0,0 +1,565 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +/* This file contains four functions: + * + * PNCIO_Calc_aggregator() + * PNCIO_Calc_file_domains() + * PNCIO_Calc_my_req() + * PNCIO_Free_my_req() + * PNCIO_Calc_others_req() + * PNCIO_Free_others_req() + * + * The last three of these were originally in ad_read_coll.c, but they are + * also shared with ad_write_coll.c. I felt that they were better kept with + * the rest of the shared aggregation code. + */ + +/* Discussion of values available from above: + * + * MPI_Offset st_offsets[0..nprocs-1] + * MPI_Offset end_offsets[0..nprocs-1] + * These contain a list of start and end offsets for each process in + * the communicator. For example, an access at loc 10, size 10 would + * have a start offset of 10 and end offset of 19. 
+ * int nprocs + * number of processors in the collective I/O communicator + * MPI_Offset min_st_offset + * MPI_Offset fd_start[0..nprocs_for_coll-1] + * starting location of "file domain"; region that a given process will + * perform aggregation for (i.e. actually do I/O) + * MPI_Offset fd_end[0..nprocs_for_coll-1] + * start + size - 1 roughly, but it can be less, or 0, in the case of + * uneven distributions + */ + +/* PNCIO_Calc_aggregator() + * + * The intention here is to implement a function which provides basically + * the same functionality as in Rajeev's original version of + * PNCIO_Calc_my_req(). He used a ceiling division approach to assign the + * file domains, and we use the same approach here when calculating the + * location of an offset/len in a specific file domain. Further we assume + * this same distribution when calculating the rank_index, which is later + * used to map to a specific process rank in charge of the file domain. + * + * A better (i.e. more general) approach would be to use the list of file + * domains only. This would be slower in the case where the + * original ceiling division was used, but it would allow for arbitrary + * distributions of regions to aggregators. We'd need to know the + * nprocs_for_coll in that case though, which we don't have now. + * + * Note a significant difference between this function and Rajeev's old code: + * this code doesn't necessarily return a rank in the range + * 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a + * result of the rank mapping; any set of ranks in the communicator could be + * used now. + * + * Returns an integer representing a rank in the collective I/O communicator. + * + * The "len" parameter is also modified to indicate the amount of data + * actually available in this file domain. 
+ */ +int PNCIO_Calc_aggregator(const PNCIO_File *fd, + MPI_Offset off, + MPI_Offset min_off, + MPI_Offset *len, /* may be modified when return */ + MPI_Offset fd_size, + const MPI_Offset *fd_end) +{ + int rank_index, rank; + MPI_Offset avail_bytes; + + /* get an index into our array of aggregators */ + rank_index = (int) ((off - min_off + fd_size) / fd_size - 1); + + if (fd->hints->striping_unit > 0) { + /* Implementation for file domain alignment. Note fd_end[] have been + * aligned with file system lock boundaries when it was produced by + * PNCIO_Calc_file_domains(). + */ + rank_index = 0; + while (off > fd_end[rank_index]) + rank_index++; + } + + /* we index into fd_end with rank_index, and fd_end was allocated to be no + * bigger than fd->hins->cb_nodes. If we ever violate that, we're + * overrunning arrays. Obviously, we should never ever hit this abort */ + if (rank_index >= fd->hints->cb_nodes || rank_index < 0) { + fprintf(stderr, + "Error in PNCIO_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size="OFFFMT" off="OFFFMT"\n", + rank_index, fd->hints->cb_nodes, fd_size, off); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + /* remember here that even in Rajeev's original code it was the case that + * different aggregators could end up with different amounts of data to + * aggregate. here we use fd_end[] to make sure that we know how much + * data this aggregator is working with. + * + * the +1 is to take into account the end vs. length issue. + */ + avail_bytes = fd_end[rank_index] + 1 - off; + if (avail_bytes < *len) { + /* this file domain only has part of the requested contig. 
region */ + *len = avail_bytes; + } + + /* map our index to a rank */ + /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */ + rank = fd->hints->ranklist[rank_index]; + + return rank; +} + +void PNCIO_Calc_file_domains(MPI_Offset *st_offsets, + MPI_Offset *end_offsets, + int nprocs, + int nprocs_for_coll, + MPI_Offset *min_st_offset_ptr, + MPI_Offset **fd_start_ptr, + MPI_Offset **fd_end_ptr, + MPI_Offset *fd_size_ptr, + int striping_unit) +{ +/* Divide the I/O workload among "nprocs_for_coll" processes. This is + done by (logically) dividing the file into file domains (FDs); each + process may directly access only its own file domain. */ + + MPI_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, fd_size; + int i; + +/* find min of start offsets and max of end offsets of all processes */ + + min_st_offset = st_offsets[0]; + max_end_offset = end_offsets[0]; + + for (i = 1; i < nprocs; i++) { + min_st_offset = MIN(min_st_offset, st_offsets[i]); + max_end_offset = MAX(max_end_offset, end_offsets[i]); + } + +/* determine the "file domain (FD)" of each process, i.e., the portion of + the file that will be "owned" by each process */ + +/* partition the total file access range equally among nprocs_for_coll + processes */ + fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - 1) / nprocs_for_coll; + /* ceiling division as in HPF block distribution */ + + *fd_start_ptr = (MPI_Offset *) NCI_Malloc(nprocs_for_coll * 2 * sizeof(MPI_Offset)); + *fd_end_ptr = *fd_start_ptr + nprocs_for_coll; + + fd_start = *fd_start_ptr; + fd_end = *fd_end_ptr; + + /* Wei-keng Liao: implementation for fild domain alignment to nearest file + * lock boundary (as specified by striping_unit hint). 
Could also + * experiment with other alignment strategies here */ + if (striping_unit > 0) { + MPI_Offset end_off; + int rem_front, rem_back; + + /* align fd_end[0] to the nearest file lock boundary */ + fd_start[0] = min_st_offset; + end_off = fd_start[0] + fd_size; + rem_front = end_off % striping_unit; + rem_back = striping_unit - rem_front; + if (rem_front < rem_back) + end_off -= rem_front; + else + end_off += rem_back; + fd_end[0] = end_off - 1; + + /* align fd_end[i] to the nearest file lock boundary */ + for (i = 1; i < nprocs_for_coll; i++) { + fd_start[i] = fd_end[i - 1] + 1; + end_off = min_st_offset + fd_size * (i + 1); + rem_front = end_off % striping_unit; + rem_back = striping_unit - rem_front; + if (rem_front < rem_back) + end_off -= rem_front; + else + end_off += rem_back; + fd_end[i] = end_off - 1; + } + fd_end[nprocs_for_coll - 1] = max_end_offset; + } else { /* no hints set: do things the 'old' way */ + fd_start[0] = min_st_offset; + fd_end[0] = min_st_offset + fd_size - 1; + + for (i = 1; i < nprocs_for_coll; i++) { + fd_start[i] = fd_end[i - 1] + 1; + fd_end[i] = fd_start[i] + fd_size - 1; + } + } + +/* take care of cases in which the total file access range is not + divisible by the number of processes. In such cases, the last + process, or the last few processes, may have unequal load (even 0). + For example, a range of 97 divided among 16 processes. + Note that the division is ceiling division. 
/* PNCIO_Calc_my_req() - calculate what portions of the access requests
 * of this process are located in the file domains of various processes
 * (including this one)
 *
 * fd                        [in]  file handle; the offset-length pairs in
 *                                 fd->flat_file are the requests of this
 *                                 process
 * min_st_offset, fd_end,
 * fd_size                   [in]  passed unchanged to PNCIO_Calc_aggregator()
 * nprocs                    [in]  length of every per-process array
 *                                 allocated here
 * count_my_req_procs_ptr    [out] number of processes i for which
 *                                 count_my_req_per_proc[i] is nonzero
 * count_my_req_per_proc_ptr [out] newly allocated array of nprocs counters;
 *                                 entry i is the number of contiguous pieces
 *                                 of this process's requests assigned to
 *                                 process i
 * my_req_ptr                [out] newly allocated array of nprocs
 *                                 PNCIO_Access; offsets[] and lens[] of all
 *                                 entries are carved out of one allocation
 *                                 whose base is kept in my_req[0].offsets
 * buf_idx_ptr               [out] newly allocated array of nprocs entries;
 *                                 see the comment at its allocation below
 *
 * The three output arrays are released by PNCIO_Free_my_req().
 */
void PNCIO_Calc_my_req(PNCIO_File *fd,
                       MPI_Offset min_st_offset,
                       const MPI_Offset *fd_end,
                       MPI_Offset fd_size,
                       int nprocs,
                       MPI_Count *count_my_req_procs_ptr,
                       MPI_Count **count_my_req_per_proc_ptr,
                       PNCIO_Access **my_req_ptr,
                       MPI_Aint **buf_idx_ptr)
/* buf_idx[] entries are of type MPI_Aint. They are used as memory buffer
   indices and the code below asserts that each running byte count fits in an
   MPI_Aint before storing it. */
{
    MPI_Count *count_my_req_per_proc, count_my_req_procs, l;
    MPI_Aint *buf_idx;
    int proc;
    size_t memLen, alloc_sz;
    MPI_Offset fd_len, rem_len, curr_idx, off, *off_ptr;
#ifdef HAVE_MPI_LARGE_COUNT
    MPI_Offset *len_ptr;
#else
    int *len_ptr;
#endif
    PNCIO_Access *my_req;

    *count_my_req_per_proc_ptr = NCI_Calloc(nprocs, sizeof(MPI_Count));
    count_my_req_per_proc = *count_my_req_per_proc_ptr;
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
   process in process i's file domain. calloc initializes to zero.
   I'm allocating memory of size nprocs, so that I can do an
   MPI_Alltoall later on.*/

    buf_idx = (MPI_Aint *) NCI_Malloc(nprocs * sizeof(MPI_Aint));
/* buf_idx is relevant only if buftype_is_contig.
   buf_idx[i] gives the index into user_buf where data received
   from proc. i should be placed. This allows receives to be done
   without extra buffer. This can't be done if buftype is not contig.
   An entry stays -1 when no piece of this process's requests is assigned
   to proc. i. */

    /* initialize buf_idx to -1 */
    for (int i = 0; i < nprocs; i++)
        buf_idx[i] = -1;

    /* fd->flat_file.count has been checked and adjusted to a positive number
     * at the beginning of PNCIO_GEN_ReadStridedColl() and
     * PNCIO_GEN_WriteStridedColl().
     */
    assert(fd->flat_file.count > 0);

    /* one pass just to calculate how much space to allocate for my_req */
    for (MPI_Count i = 0; i < fd->flat_file.count; i++) {
        /* short circuit offset/len processing if len == 0
         * (zero-byte read/write) */
        if (fd->flat_file.len[i] == 0)
            continue;
        off = fd->flat_file.off[i];
        fd_len = fd->flat_file.len[i];
        /* note: we set fd_len to be the total size of the access. then
         * PNCIO_Calc_aggregator() will modify the value to return the
         * amount that was available from the file domain that holds the
         * first part of the access.
         */
        proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_end);
        count_my_req_per_proc[proc]++;

        /* figure out how much data is remaining in the access (i.e. wasn't
         * part of the file domain that had the starting byte); we'll take
         * care of this data (if there is any) in the while loop below.
         */
        rem_len = fd->flat_file.len[i] - fd_len;

        while (rem_len != 0) {
            off += fd_len;      /* point to first remaining byte */
            fd_len = rem_len;   /* save remaining size, pass to calc */
            proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len,
                                         fd_size, fd_end);

            count_my_req_per_proc[proc]++;
            rem_len -= fd_len;  /* reduce remaining length by amount from fd */
        }
    }

/* now allocate space for my_req, offset, and len */

    *my_req_ptr = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access));
    my_req = *my_req_ptr;

    /* combine offsets and lens into a single region so we can make one
     * exchange instead of two later on. Over-allocate the 'offsets' array and
     * make 'lens' point to the over-allocated part
     */
    memLen = 0;
    for (int i = 0; i < nprocs; i++)
        memLen += count_my_req_per_proc[i];

    /* my_req[0].offsets always holds the base of the single allocation, even
     * when count_my_req_per_proc[0] is zero: the loop further below only
     * assigns my_req[i].offsets for entries with a nonzero count, and for
     * i == 0 it assigns this same base address.
     */
#ifdef HAVE_MPI_LARGE_COUNT
    alloc_sz = sizeof(MPI_Offset) * 2;
    my_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz);
    my_req[0].lens = my_req[0].offsets + memLen;
#else
    alloc_sz = sizeof(MPI_Offset) + sizeof(int);
    my_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz);
    my_req[0].lens = (int*) (my_req[0].offsets + memLen);
#endif

    /* hand out consecutive slices of the allocation to the processes that
     * have a nonzero count; offsets/lens of the other entries are not set
     * in this loop */
    off_ptr = my_req[0].offsets;
    len_ptr = my_req[0].lens;
    count_my_req_procs = 0;
    for (int i = 0; i < nprocs; i++) {
        if (count_my_req_per_proc[i]) {
            my_req[i].offsets = off_ptr;
            off_ptr += count_my_req_per_proc[i];
            my_req[i].lens = len_ptr;
            len_ptr += count_my_req_per_proc[i];
            count_my_req_procs++;
        }
        my_req[i].count = 0;    /* will be incremented where needed
                                 * later */
    }

/* now fill in my_req */
    curr_idx = 0;   /* running total of bytes of all pieces recorded so far */
    for (MPI_Count i = 0; i < fd->flat_file.count; i++) {
        /* short circuit offset/len processing if len == 0
         * (zero-byte read/write) */
        if (fd->flat_file.len[i] == 0)
            continue;
        off = fd->flat_file.off[i];
        fd_len = fd->flat_file.len[i];
        proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_end);

        /* for each separate contiguous access from this process */
        if (buf_idx[proc] == -1) {
            /* first piece for this proc: remember where it starts, after
             * checking that the running total survives the conversion to
             * MPI_Aint */
            assert(curr_idx == (MPI_Aint) curr_idx);
            buf_idx[proc] = (MPI_Aint) curr_idx;
        }

        l = my_req[proc].count;
        curr_idx += fd_len;

        rem_len = fd->flat_file.len[i] - fd_len;

        /* store the proc, offset, and len information in an array
         * of structures, my_req. Each structure contains the
         * offsets and lengths located in that process's FD,
         * and the associated count.
         */
        my_req[proc].offsets[l] = off;
        my_req[proc].lens[l] = fd_len;
        my_req[proc].count++;

        /* the access continues into the following file domain(s); record one
         * piece per file domain it crosses, mirroring the counting pass */
        while (rem_len != 0) {
            off += fd_len;
            fd_len = rem_len;
            proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len,
                                         fd_size, fd_end);

            if (buf_idx[proc] == -1) {
                assert(curr_idx == (MPI_Aint) curr_idx);
                buf_idx[proc] = (MPI_Aint) curr_idx;
            }

            l = my_req[proc].count;
            curr_idx += fd_len;
            rem_len -= fd_len;

            my_req[proc].offsets[l] = off;
            my_req[proc].lens[l] = fd_len;
            my_req[proc].count++;
        }
    }

    *count_my_req_procs_ptr = count_my_req_procs;
    *buf_idx_ptr = buf_idx;
}
*/ + + MPI_Count *count_others_req_per_proc, count_others_req_procs; + size_t alloc_sz; + int i, j; + MPI_Request *requests; + PNCIO_Access *others_req; + size_t memLen; + MPI_Offset *off_ptr; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_ptr; + MPI_Count *mem_ptr; +#else + int *len_ptr; + MPI_Aint *mem_ptr; +#endif + +/* first find out how much to send/recv and from/to whom */ + count_others_req_per_proc = NCI_Malloc(nprocs * sizeof(MPI_Count)); + + MPI_Alltoall(count_my_req_per_proc, 1, MPI_COUNT, + count_others_req_per_proc, 1, MPI_COUNT, fd->comm); + + *others_req_ptr = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access)); + others_req = *others_req_ptr; + + memLen = 0; + for (i = 0; i < nprocs; i++) + memLen += count_others_req_per_proc[i]; + +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) * 2 + sizeof(MPI_Count); + others_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + others_req[0].lens = others_req[0].offsets + memLen; + others_req[0].mem_ptrs = (MPI_Count*) (others_req[0].lens + memLen); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int) + sizeof(MPI_Aint); + others_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + others_req[0].lens = (int *) (others_req[0].offsets + memLen); + others_req[0].mem_ptrs = (MPI_Aint*) (others_req[0].lens + memLen); +#endif + off_ptr = others_req[0].offsets; + len_ptr = others_req[0].lens; + mem_ptr = others_req[0].mem_ptrs; + + count_others_req_procs = 0; + for (i = 0; i < nprocs; i++) { + if (count_others_req_per_proc[i]) { + others_req[i].count = count_others_req_per_proc[i]; + others_req[i].offsets = off_ptr; + off_ptr += count_others_req_per_proc[i]; + others_req[i].lens = len_ptr; + len_ptr += count_others_req_per_proc[i]; + others_req[i].mem_ptrs = mem_ptr; + mem_ptr += count_others_req_per_proc[i]; + count_others_req_procs++; + } else + others_req[i].count = 0; + } + *count_others_req_per_proc_ptr = count_others_req_per_proc; + +/* now send the calculated offsets 
and lengths to respective processes */ + + requests = (MPI_Request *) + NCI_Malloc((count_my_req_procs + count_others_req_procs) * 2 * sizeof(MPI_Request)); + + j = 0; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count == 0) + continue; + if (i == myrank) { + /* send to self uses memcpy()C, here others_req[i].count == my_req[i].count */ + memcpy(others_req[i].offsets, my_req[i].offsets, + my_req[i].count * sizeof(MPI_Offset)); +#ifdef HAVE_MPI_LARGE_COUNT + memcpy(others_req[i].lens, my_req[i].lens, + my_req[i].count * sizeof(MPI_Offset)); +#else + memcpy(others_req[i].lens, my_req[i].lens, + my_req[i].count * sizeof(int)); +#endif + } + else { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(others_req[i].offsets, others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Irecv_c(others_req[i].lens, others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); +#else + assert(others_req[i].count <= 2147483647); /* overflow 4-byte int */ + MPI_Irecv(others_req[i].offsets, (int)others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Irecv(others_req[i].lens, (int)others_req[i].count, + MPI_INT, i, i + myrank, fd->comm, &requests[j++]); +#endif + } + } + + for (i = 0; i < nprocs; i++) { + if (my_req[i].count && i != myrank) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Isend_c(my_req[i].offsets, my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Isend_c(my_req[i].lens, my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); +#else + assert(my_req[i].count <= 2147483647); /* overflow 4-byte int */ + MPI_Isend(my_req[i].offsets, (int)my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Isend(my_req[i].lens, (int)my_req[i].count, + MPI_INT, i, i + myrank, fd->comm, &requests[j++]); +#endif + } + } + + if (j) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(j, requests, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) 
NCI_Malloc(j * sizeof(MPI_Status)); + MPI_Waitall(j, requests, statuses); + NCI_Free(statuses); +#endif + } + + NCI_Free(requests); + + *count_others_req_procs_ptr = count_others_req_procs; +} + +void PNCIO_Free_others_req(MPI_Count *count_others_req_per_proc, + PNCIO_Access *others_req) +{ + NCI_Free(count_others_req_per_proc); + NCI_Free(others_req[0].offsets); + NCI_Free(others_req); +} + diff --git a/src/drivers/pncio/pncio_close.c b/src/drivers/pncio/pncio_close.c new file mode 100644 index 0000000000..363e41cf3f --- /dev/null +++ b/src/drivers/pncio/pncio_close.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include /* strdup() */ +#include +#include + +#include + +#include "pncio.h" + +/*----< PNCIO_File_close() >--------------------------------------------------*/ +int PNCIO_File_close(PNCIO_File *fh) +{ + int err = NC_NOERR; + + if (fh->is_open) { + err = close(fh->fd_sys); + if (err != 0) + err = ncmpii_error_posix2nc("close"); + } + + if (fh->hints->ranklist != NULL) + NCI_Free(fh->hints->ranklist); + if (fh->hints != NULL) + NCI_Free(fh->hints); + if (fh->info != MPI_INFO_NULL) + MPI_Info_free(&(fh->info)); + if (fh->io_buf != NULL) + NCI_Free(fh->io_buf); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int i, world_rank; + double timing[NMEASURES*2], max_t[NMEASURES*2], pread_t; + MPI_Count max_ntimes, counter[NMEASURES*2], max_c[NMEASURES*2]; + + /* print two-phase I/O timing breakdown */ + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + for (i=0; iwrite_timing[i]; + counter[i] = fh->write_counter[i]; + timing[i+NMEASURES] = fh->read_timing[i]; + counter[i+NMEASURES] = fh->read_counter[i]; + } + MPI_Reduce(timing, max_t, NMEASURES*2, MPI_DOUBLE, MPI_MAX, 0, fh->comm); + MPI_Reduce(counter, max_c, NMEASURES*2, MPI_COUNT, MPI_MAX, 0, fh->comm); + + pread_t = max_t[NMEASURES+2]; + max_ntimes = 
max_c[0]; + + if (world_rank == 0 && max_ntimes > 0) { + printf("%s: TWO-PHASE write init %5.2f pwrite %5.2f pread %5.2f post %5.2f hsort %5.2f comm %5.2f collw %5.2f\n", + __func__, max_t[1], max_t[2], pread_t, max_t[4], max_t[5], max_t[3], max_t[0]); + printf("%s: TWO-PHASE write ntimes %lld check_hole %lld (total_num %lld nrecv %lld) no check %lld (total_num %lld nrecv %lld)\n", + __func__, max_c[0], max_c[1], max_c[2], max_c[3], max_c[4], max_c[5], max_c[6]); + } + + max_ntimes = max_c[NMEASURES]; + + if (world_rank == 0 && max_ntimes > 0) + printf("%s: TWO-PHASE read init %5.2f pread %5.2f post %5.2f wait %5.2f collr %5.2f ntimes %lld\n", + __func__, max_t[NMEASURES+1], max_t[NMEASURES+2], max_t[NMEASURES+4], max_t[NMEASURES+3], max_t[NMEASURES+0], max_ntimes); +#endif + + return err; +} diff --git a/src/drivers/pncio/pncio_delete.c b/src/drivers/pncio/pncio_delete.c new file mode 100644 index 0000000000..514f3a3253 --- /dev/null +++ b/src/drivers/pncio/pncio_delete.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#ifdef HAVE_UNISTD_H +#include /* unlink() */ +#endif + +#include +#include "pncio.h" + +/*----< PNCIO_File_delete() >-------------------------------------------------*/ +int PNCIO_File_delete(const char *filename) +{ + int err = NC_NOERR; + char *path = ncmpii_remove_file_system_type_prefix(filename); + + err = unlink(path); + if (err != 0) + err = ncmpii_error_posix2nc("unlink"); + + return err; +} + diff --git a/src/drivers/pncio/pncio_fstype.c b/src/drivers/pncio/pncio_fstype.c new file mode 100644 index 0000000000..9713b70114 --- /dev/null +++ b/src/drivers/pncio/pncio_fstype.c @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include /* readlink() */ +#include /* strdup() */ +#include +#include +#include /* open(), O_CREAT */ +#include /* open() */ +#include /* basename() */ + +#ifdef HAVE_LIMITS_H +#include +#endif +#ifndef PATH_MAX +#define PATH_MAX 65535 +#endif + +#ifdef HAVE_SYS_VFS_H +#include +#endif +#ifdef HAVE_SYS_STATVFS_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include /* struct statfs */ +#endif +#ifdef HAVE_SYS_MOUNT_H +#include /* struct statfs */ +#endif +#ifdef HAVE_SYS_STAT_H +#include /* open(), fstat(), lstat(), stat() */ +#endif + +#include + +#include "pncio.h" + +/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that here. + * We assume that S_ISLNK is *always* defined as a macro. If that is not + * universally true, then add a test to the configure that tries to link + * a program that references S_ISLNK + */ +#if !defined(S_ISLNK) +#if defined(S_IFLNK) +/* Check for the link bit */ +#define S_ISLNK(mode) ((mode) & S_IFLNK) +#else +/* no way to check if it is a link, so say false */ +#define S_ISLNK(mode) 0 +#endif +#endif /* !(S_ISLNK) */ + +/* Returns a string, the parent directory of a given filename. + * The caller should free the memory located returned by this subroutine. + */ +static +void parentdir(const char *filename, char **dirnamep) +{ + int err; + char *dir = NULL, *slash; + struct stat statbuf; + + err = lstat(filename, &statbuf); + + if (err || (!S_ISLNK(statbuf.st_mode))) { + /* No such file, or file is not a link; these are the "normal" cases + * where we can just return the parent directory. + */ + dir = NCI_Strdup(filename); + } else { + /* filename is a symlink. We've presumably already tried to stat it + * and found it to be missing (dangling link), but this code doesn't + * care if the target is really there or not. 
+ */ + ssize_t namelen; + char *linkbuf; + + linkbuf = NCI_Malloc(PATH_MAX + 1); + namelen = readlink(filename, linkbuf, PATH_MAX + 1); + if (namelen == -1) { + /* Something strange has happened between the time that we + * determined that this was a link and the time that we attempted + * to read it; punt and use the old name. + */ + dir = NCI_Strdup(filename); + } else { + /* successfully read the link */ + linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */ + dir = NCI_Strdup(linkbuf); + } + NCI_Free(linkbuf); + } + + slash = strrchr(dir, '/'); + if (!slash) + strncpy(dir, ".", 2); + else { + if (slash == dir) + *(dir + 1) = '\0'; + else + *slash = '\0'; + } + + *dirnamep = dir; + return; +} + +#define UNKNOWN_SUPER_MAGIC (0xDEADBEEF) +#ifndef LL_SUPER_MAGIC +#define LL_SUPER_MAGIC 0x0BD00BD0 +#endif + +static int check_statfs(const char *filename, int64_t * file_id) +{ + int err = 0; + +#ifdef HAVE_STRUCT_STATVFS_WITH_F_BASETYPE + /* rare: old solaris machines */ + struct statvfs vfsbuf; +#endif +#if defined(HAVE_STRUCT_STATFS_F_TYPE) || defined(HAVE_STRUCT_STATFS_F_FSTYPENAME) + /* common fs-detection logic for any modern POSIX-compliant environment, + * with the one wrinkle that some platforms (Darwin, BSD) give us a file + * system as a string, not an identifier */ + struct statfs fsbuf; +#endif + + *file_id = UNKNOWN_SUPER_MAGIC; + +#ifdef HAVE_STRUCT_STATVFS_WITH_F_BASETYPE + err = statvfs(filename, &vfsbuf); + if (err == 0) + *file_id = vfsbuf.f_basetype; +#endif + + /* remember above how I said 'statfs with f_type' was the common linux-y + * way to report file system type? Darwin (and probably the BSDs) *also* + * uses f_type but it is "reserved" and does not give us anything + * meaningful. Fine. If configure detects f_type we'll use it here and on + * those "reserved" platforms we'll ignore that result and check the + * f_fstypename field. 
+ */ + +#ifdef HAVE_STRUCT_STATFS_F_TYPE + err = statfs(filename, &fsbuf); + if (err == 0) { + *file_id = fsbuf.f_type; + return 0; + } +#endif + +#ifdef HAVE_STRUCT_STATFS_F_FSTYPENAME + /* these stat routines store the file system type in a string */ + err = statfs(filename, &fsbuf); + if (err == 0 && !strncasecmp(fsbuf.f_fstypename, "lustre", 6)) { + *file_id = LL_SUPER_MAGIC; + return 0; + } +#endif + +#ifdef HAVE_STRUCT_STAT_ST_FSTYPE + struct stat sbuf; + err = stat(filename, &sbuf); + if (err == 0) { + *file_id = sbuf.st_fstype; + return 0; + } +#endif + return err; +} + +/* Check if file system type from file name, using a system-dependent function + * call. + */ +int PNCIO_FileSysType(const char *filename) +{ + + int err, retry_cnt; + int64_t file_id=UNKNOWN_SUPER_MAGIC; + + char *colon = strchr(filename, ':'); + if (colon != NULL) { /* there is a prefix end with : */ + if (!strncmp(filename, "lustre", 6)) + return PNCIO_LUSTRE; + else if (!strncmp(filename, "ufs", 3)) + return PNCIO_UFS; + else + return 0; + } +#ifdef MIMIC_LUSTRE + return PNCIO_LUSTRE; +#endif + + /* NFS can get stuck and end up returning ESTALE "forever" */ + +#define MAX_ESTALE_RETRY 10000 + + retry_cnt = 0; + do { + err = check_statfs(filename, &file_id); + } while (err && (errno == ESTALE) && retry_cnt++ < MAX_ESTALE_RETRY); + + if (err) { + /* ENOENT may be returned in two cases: + * 1) no directory entry for "filename" + * 2) "filename" is a dangling symbolic link + * + * parentdir() tries to deal with both cases. 
+ */ + if (errno == ENOENT) { + char *dir; + parentdir(filename, &dir); + err = check_statfs(dir, &file_id); + NCI_Free(dir); + } else + return 0; + } + + if (file_id == LL_SUPER_MAGIC) + return PNCIO_LUSTRE; + else + return PNCIO_UFS; /* UFS support if we don't know what else to use */ +} + diff --git a/src/drivers/pncio/pncio_hints.c b/src/drivers/pncio/pncio_hints.c new file mode 100644 index 0000000000..5272c620eb --- /dev/null +++ b/src/drivers/pncio/pncio_hints.c @@ -0,0 +1,275 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include "pncio.h" + +#define GET_INFO_INT(key) { \ + MPI_Info_get(users_info, #key, MPI_MAX_INFO_VAL, value, &flag); \ + if (flag) { \ + MPI_Info_set(fd->info, #key, value); \ + fd->hints->key = atoi(value); \ + } \ +} + +#define GET_INFO_STR(key) { \ + MPI_Info_get(users_info, #key, MPI_MAX_INFO_VAL, value, &flag); \ + if (flag) { \ + MPI_Info_set(fd->info, #key, value); \ + if (!strcasecmp(value, "true")) \ + fd->hints->key = PNCIO_HINT_ENABLE; \ + else if (!strcasecmp(value, "false")) \ + fd->hints->key = PNCIO_HINT_DISABLE; \ + else if (!strcasecmp(value, "automatic")) \ + fd->hints->key = PNCIO_HINT_AUTO; \ + else if (!strcasecmp(value, "enable")) \ + fd->hints->key = PNCIO_HINT_ENABLE; \ + else if (!strcasecmp(value, "disable")) \ + fd->hints->key = PNCIO_HINT_DISABLE; \ + else if (!strcasecmp(value, "inherit")) \ + fd->hints->key = PNCIO_STRIPING_INHERIT; \ + } \ +} + +#ifdef PNETCDF_DEBUG +#define CHECK_HINT(hint) { \ + if (fd->hints->hint != root_hints->hint) { \ + char int_str[16]; \ + fprintf(stderr, "Error: inconsistent I/O hint %s (%d at rank %d, %d at root)\n", \ + #hint, fd->hints->hint, rank, root_hints->hint); \ + /* overwrite local's hint with root's */ \ + snprintf(int_str, 16, "%d", root_hints->hint); \ + MPI_Info_set(fd->info, 
#hint, int_str); \ + err = NC_EMULTIDEFINE_HINTS; \ + } \ +} +#else +#define CHECK_HINT(hint) { \ + if (fd->hints->hint != root_hints->hint) { \ + /* overwrite local's hint with root's */ \ + char int_str[16]; \ + snprintf(int_str, 16, "%d", root_hints->hint); \ + MPI_Info_set(fd->info, #hint, int_str); \ + err = NC_EMULTIDEFINE_HINTS; \ + } \ +} +#endif + +/*----< hint_consistency_check() >-------------------------------------------*/ +static +int hint_consistency_check(PNCIO_File *fd) +{ + int err, rank; + + MPI_Comm_rank(fd->comm, &rank); + + err = NC_NOERR; + + if (rank == 0) + /* broadcast root's hints */ + MPI_Bcast(fd->hints, sizeof(PNCIO_Hints), MPI_BYTE, 0, fd->comm); + else { + PNCIO_Hints *root_hints; + root_hints = (PNCIO_Hints*) NCI_Malloc(sizeof(PNCIO_Hints)); + + /* broadcast root's hints */ + MPI_Bcast(root_hints, sizeof(PNCIO_Hints), MPI_BYTE, 0, fd->comm); + + /* check hints individually against root's */ + CHECK_HINT(nc_striping); + CHECK_HINT(striping_factor); + CHECK_HINT(striping_unit); + CHECK_HINT(start_iodevice); + CHECK_HINT(cb_nodes); + CHECK_HINT(cb_buffer_size); + CHECK_HINT(ind_rd_buffer_size); + CHECK_HINT(ind_wr_buffer_size); + + CHECK_HINT(romio_cb_read); + CHECK_HINT(romio_cb_write); + CHECK_HINT(romio_ds_read); + CHECK_HINT(romio_ds_write); + CHECK_HINT(romio_no_indep_rw); + + CHECK_HINT(lustre_overstriping_ratio); + + NCI_Free(root_hints); + } + + /* All NetCDF erro codes are negative */ + MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_MIN, fd->comm); + + return err; +} + +/*----< PNCIO_File_SetInfo() >------------------------------------------------*/ +/* For PnetCDF, a file info object can only be passed to PnetCDF at file create + * or open call, i.e. I/O hints cannot be changed after file create/open. + * + * This subroutine is a collective call, because it checks consistency of all + * hints among all processes. 
+ */ +int +PNCIO_File_SetInfo(PNCIO_File *fd, + MPI_Info users_info) +{ + int err=NC_NOERR, flag, nprocs; + char value[MPI_MAX_INFO_VAL + 1]; + + if (users_info == MPI_INFO_NULL) + MPI_Info_create(&fd->info); + else + MPI_Info_dup(users_info, &fd->info); + + MPI_Comm_size(fd->comm, &nprocs); + + /* initialize fd->info and hints to default values */ + + /* buffer size for collective I/O */ + MPI_Info_set(fd->info, "cb_buffer_size", PNCIO_CB_BUFFER_SIZE_DFLT); + fd->hints->cb_buffer_size = atoi(PNCIO_CB_BUFFER_SIZE_DFLT); + + /* default is to let pncio automatically decide whether or not to use + * collective buffering + */ + MPI_Info_set(fd->info, "romio_cb_read", "automatic"); + fd->hints->romio_cb_read = PNCIO_HINT_AUTO; + MPI_Info_set(fd->info, "romio_cb_write", "automatic"); + fd->hints->romio_cb_write = PNCIO_HINT_AUTO; + + /* cb_nodes may be set later right after file open call */ + fd->hints->cb_nodes = 0; + + /* hint indicating that no indep. I/O will be performed on this file */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->romio_no_indep_rw = 0; + + /* buffer size for data sieving in independent reads */ + MPI_Info_set(fd->info, "ind_rd_buffer_size", PNCIO_IND_RD_BUFFER_SIZE_DFLT); + fd->hints->ind_rd_buffer_size = atoi(PNCIO_IND_RD_BUFFER_SIZE_DFLT); + + /* buffer size for data sieving in independent writes */ + MPI_Info_set(fd->info, "ind_wr_buffer_size", PNCIO_IND_WR_BUFFER_SIZE_DFLT); + fd->hints->ind_wr_buffer_size = atoi(PNCIO_IND_WR_BUFFER_SIZE_DFLT); + + /* default is to let romio automatically decide when to use data + * sieving + */ + MPI_Info_set(fd->info, "romio_ds_read", "automatic"); + fd->hints->romio_ds_read = PNCIO_HINT_AUTO; + MPI_Info_set(fd->info, "romio_ds_write", "automatic"); + fd->hints->romio_ds_write = PNCIO_HINT_AUTO; + + /* File striping parameters will be retrieved from the file system set, + * once the file is opened. These parameters can also be customized by + * a user's info. 
Thus, default values used below are to indicate + * whether or not they have been customized by the users. + */ + fd->hints->nc_striping = PNCIO_STRIPING_AUTO; + fd->hints->striping_unit = 0; + fd->hints->striping_factor = 0; + fd->hints->start_iodevice = -1; + /* Lustre overstriping ratio. 0 or 1 means disabled */ + fd->hints->lustre_overstriping_ratio = 1; + + /* add in user's info --------------------------------------------------*/ + + if (users_info == MPI_INFO_NULL) goto err_out; + + /* size of internal buffer to be used in collective reads and writes */ + GET_INFO_INT(cb_buffer_size); + + /* enable/disable collective buffering */ + GET_INFO_STR(romio_cb_read); + if (fd->hints->romio_cb_read == PNCIO_HINT_DISABLE) { + /* romio_cb_read overrides romio_no_indep_rw */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->romio_no_indep_rw = PNCIO_HINT_DISABLE; + } + + GET_INFO_STR(romio_cb_write); + if (fd->hints->romio_cb_write == PNCIO_HINT_DISABLE) { + /* romio_cb_write overrides romio_no_indep_rw */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->romio_no_indep_rw = PNCIO_HINT_DISABLE; + } + + /* user intends to call collective I/O APIs only */ + GET_INFO_STR(romio_no_indep_rw); + if (fd->hints->romio_no_indep_rw == PNCIO_HINT_ENABLE) { + MPI_Info_set(fd->info, "romio_cb_write", "enable"); + MPI_Info_set(fd->info, "romio_cb_read", "enable"); + fd->hints->romio_cb_read = PNCIO_HINT_ENABLE; + fd->hints->romio_cb_write = PNCIO_HINT_ENABLE; + } + + /* enable/disable data sieving */ + GET_INFO_STR(romio_ds_read); + GET_INFO_STR(romio_ds_write); + + /* number of I/O aggregators */ + GET_INFO_INT(cb_nodes); + /* check ill value */ + if (fd->hints->cb_nodes > 0 && fd->hints->cb_nodes <= nprocs) { + snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", fd->hints->cb_nodes); + MPI_Info_set(fd->info, "cb_nodes", value); + } + else { + fd->hints->cb_nodes = 0; + MPI_Info_set(fd->info, "cb_nodes", "0"); + } + + GET_INFO_INT(ind_wr_buffer_size); 
+ GET_INFO_INT(ind_rd_buffer_size); + + /* file striping configuration */ + GET_INFO_STR(nc_striping); + GET_INFO_INT(striping_unit); + GET_INFO_INT(striping_factor); + GET_INFO_INT(start_iodevice); + + /* Lustre overstriping ratio. 0 or 1 means disabled */ + GET_INFO_INT(lustre_overstriping_ratio); + + /* Check hint consistency among all processes */ +err_out: + err = hint_consistency_check(fd); + + /* PnetCDF ignores the following hints. + * cb_config_list + * deferred_open + */ + + return err; +} + +/*----< PNCIO_File_get_info() >-----------------------------------------------*/ +int PNCIO_File_get_info(PNCIO_File *fd, + MPI_Info *info_used) +{ + int err; + + err = MPI_Info_dup(fd->info, info_used); + if (err == MPI_SUCCESS) + err = NC_NOERR; + else + err = ncmpii_error_mpi2nc(err, "MPI_Info_dup"); + + return err; +} + diff --git a/src/drivers/pncio/pncio_lock.c b/src/drivers/pncio/pncio_lock.c new file mode 100644 index 0000000000..a78d181dbe --- /dev/null +++ b/src/drivers/pncio/pncio_lock.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#include + +static +const char *GEN_flock_cmd_to_string(int cmd) +{ + switch (cmd) { +#ifdef F_GETLK64 + case F_GETLK64: + return "F_GETLK64"; +#else + case F_GETLK: + return "F_GETLK"; +#endif +#ifdef F_SETLK64 + case F_SETLK64: + return "F_SETLK64"; +#else + case F_SETLK: + return "F_SETLK"; +#endif +#ifdef F_SETLKW64 + case F_SETLKW64: + return "F_SETLKW64"; +#else + case F_SETLKW: + return "F_SETLKW"; +#endif + default: + return "UNEXPECTED"; + } +} + +static +const char *GEN_flock_type_to_string(int type) +{ + switch (type) { + case F_RDLCK: + return "F_RDLCK"; + case F_WRLCK: + return "F_WRLCK"; + case F_UNLCK: + return "F_UNLOCK"; + default: + return "UNEXPECTED"; + } +} + +int PNCIO_GEN_SetLock(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, int whence, + MPI_Offset len) +{ + FDTYPE fd_sys = fd->fd_sys; + int err, error_code, err_count = 0, sav_errno; + struct flock lock; + + if (len == 0) + return MPI_SUCCESS; + + + /* Depending on the compiler flags and options, struct flock + * may not be defined with types that are the same size as + * MPI_Offsets. */ +/* FIXME: This is a temporary hack until we use flock64 where + available. It also doesn't fix the broken Solaris header sys/types.h + header file, which declares off_t as a UNION ! Configure tests to + see if the off64_t is a union if large file support is requested; + if so, it does not select large file support. 
+*/ +#ifdef NEEDS_INT_CAST_WITH_FLOCK + lock.l_type = type; + lock.l_start = (int) offset; + lock.l_whence = whence; + lock.l_len = (int) len; +#else + lock.l_type = type; + lock.l_whence = whence; + lock.l_start = offset; + lock.l_len = len; +#endif + + sav_errno = errno; /* save previous errno in case we recover from retryable errors */ + errno = 0; + do { + err = fcntl(fd_sys, cmd, &lock); + } while (err && ((errno == EINTR) || ((errno == EINPROGRESS) && (++err_count < 10000)))); + + if (err && (errno != EBADF)) { + /* FIXME: This should use the error message system, + * especially for MPICH */ + fprintf(stderr, + "This requires fcntl(2) to be implemented. As of 8/25/2011 it is not. Generic MPICH Message: File locking failed in PNCIO_GEN_SetLock(fd %X,cmd %s/%X,type %s/%X,whence %X) with return value %X and errno %X.\n" + "- If the file system is NFS, you need to use NFS version 3, ensure that the lockd daemon is running on all the machines, and mount the directory with the 'noac' option (no attribute caching).\n" + "- If the file system is LUSTRE, ensure that the directory is mounted with the 'flock' option.\n", + fd_sys, GEN_flock_cmd_to_string(cmd), cmd, + GEN_flock_type_to_string(type), type, whence, err, errno); + perror("PNCIO_GEN_SetLock:"); + fprintf(stderr, "PNCIO_GEN_SetLock:offset %llu, length %llu\n", (unsigned long long) offset, + (unsigned long long) len); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + if (!err) /* report fcntl failure errno's (EBADF), otherwise */ + errno = sav_errno; /* restore previous errno in case we recovered from retryable errors */ + + error_code = (err == 0) ? 
MPI_SUCCESS : MPI_ERR_UNKNOWN; + return error_code; +} + +int PNCIO_GEN_SetLock64(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, int whence, + MPI_Offset len) +{ + FDTYPE fd_sys = fd->fd_sys; + int err, error_code; +#ifdef _LARGEFILE64_SOURCE + struct flock64 lock; +#else + struct flock lock; +#endif + + if (len == 0) + return MPI_SUCCESS; + + lock.l_type = type; + lock.l_start = offset; + lock.l_whence = whence; + lock.l_len = len; + + do { + err = fcntl(fd_sys, cmd, &lock); + } while (err && (errno == EINTR)); + + if (err && (errno != EBADF)) { + fprintf(stderr, + "File locking failed in PNCIO_GEN_SetLock64(fd %X,cmd %s/%X,type %s/%X,whence %X) with return value %X and errno %X.\n" + "If the file system is NFS, you need to use NFS version 3, ensure that the lockd daemon is running on all the machines, and mount the directory with the 'noac' option (no attribute caching).\n", + fd_sys, GEN_flock_cmd_to_string(cmd), cmd, + GEN_flock_type_to_string(type), type, whence, err, errno); + perror("PNCIO_GEN_SetLock64:"); + fprintf(stderr, "PNCIO_GEN_SetLock:offset %llu, length %llu\n", (unsigned long long) offset, + (unsigned long long) len); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + error_code = (err == 0) ? MPI_SUCCESS : MPI_ERR_UNKNOWN; + return error_code; +} diff --git a/src/drivers/pncio/pncio_lustre_open.c b/src/drivers/pncio/pncio_lustre_open.c new file mode 100644 index 0000000000..b63e2f822f --- /dev/null +++ b/src/drivers/pncio/pncio_lustre_open.c @@ -0,0 +1,1196 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include /* open(), O_CREAT */ +#include /* open() */ +#include /* dirname() */ + +#ifdef HAVE_LIMITS_H +#include +#endif +#ifndef PATH_MAX +#define PATH_MAX 65535 +#endif + +#ifdef HAVE_SYS_STAT_H +#include /* open(), fstat() */ +#endif + +#include + +#include "pncio.h" + +#ifdef MIMIC_LUSTRE +#define xstr(s) str(s) +#define str(s) #s +#define STRIPE_SIZE 64 +#define STRIPE_COUNT 4 +#endif + +#ifdef HAVE_LUSTRE +/* /usr/include/lustre/lustreapi.h + * /usr/include/linux/lustre/lustre_user.h + */ +#include + +#define PNETCDF_LUSTRE_DEBUG +// #define PNETCDF_LUSTRE_DEBUG_VERBOSE + +#define PATTERN_STR(pattern, int_str) ( \ + (pattern == LLAPI_LAYOUT_DEFAULT) ? "LLAPI_LAYOUT_DEFAULT" : \ + (pattern == LLAPI_LAYOUT_RAID0) ? "LLAPI_LAYOUT_RAID0" : \ + (pattern == LLAPI_LAYOUT_WIDE) ? "LLAPI_LAYOUT_WIDE" : \ + (pattern == LLAPI_LAYOUT_MDT) ? "LLAPI_LAYOUT_MDT" : \ + (pattern == LLAPI_LAYOUT_OVERSTRIPING) ? "LLAPI_LAYOUT_OVERSTRIPING" : \ + (pattern == LLAPI_LAYOUT_SPECIFIC) ? 
"LLAPI_LAYOUT_SPECIFIC" : \ + int_str) + +#define PRINT_LAYOUT(val) { \ + char int_str[32]; \ + snprintf(int_str, 32, "%lu", val); \ + printf("\t%-14s = %-25s (0x%lx)\n",#val,PATTERN_STR(val, int_str),val); \ +} + +#ifdef HAVE_LLAPI_GET_OBD_COUNT + +/*----< get_total_avail_osts() >---------------------------------------------*/ +static +int get_total_avail_osts(const char *path) +{ + char *dir_path=NULL, *path_copy=NULL; + int err, ost_count=0, is_mdt=0; + struct stat sb; + + path_copy = NCI_Strdup(path); + + err = stat(path_copy, &sb); + if (errno == ENOENT) { /* file does not exist, try folder */ + /* get the parent folder name */ + dir_path = dirname(path_copy); + err = stat(dir_path, &sb); + } + if (err != 0) { + printf("Warning at %s (%d): path \"%s\" stat() failed (%s)\n", + __func__,__LINE__,path,strerror(errno)); + goto err_out; + } + + /* llapi_get_obd_count() only works for directories */ + if (S_ISDIR(sb.st_mode)) + dir_path = (dir_path == NULL) ? path_copy : dir_path; + else + /* get the parent folder name */ + dir_path = dirname(path_copy); + + err = llapi_get_obd_count(dir_path, &ost_count, is_mdt); + if (err != 0) { + printf("Warning at %d: path \"%s\" llapi_get_obd_count() failed (%s)\n", + __LINE__,dir_path,strerror(errno)); + ost_count = 0; + } + +err_out: + if (path_copy != NULL) NCI_Free(path_copy); + + return ost_count; +} + +#else + +/*----< get_total_avail_osts() >---------------------------------------------*/ +static +int get_total_avail_osts(const char *filename) +{ + char *dirc=NULL, *dname, *tail, **members=NULL, *buffer=NULL; + char pool_name[64], fsname[64], full_pool_name[128]; + int err, dd, num_members=0; + int max_members = 2048; /* Maximum number of members to retrieve */ + int buffer_size = 1048576; /* Buffer size for member names */ + struct llapi_layout *layout=NULL; + + dirc = NCI_Strdup(filename); + + struct stat sb; + if (stat(filename, &sb) == 0 && S_ISDIR(sb.st_mode)) + dname = dirc; + else + /* find the parent folder name 
*/ + dname = dirname(dirc); + + dd = open(dname, O_RDONLY, 0600); + if (dd < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) fails to open folder %s (%s)\n", + __FILE__,__LINE__, dname, strerror(errno)); +#endif + goto err_out; + } + + /* obtain Lustre layout object */ + layout = llapi_layout_get_by_fd(dd, LLAPI_LAYOUT_GET_COPY); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) llapi_layout_get_by_fd() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + + /* find the pool name */ + err = llapi_layout_pool_name_get(layout, pool_name, sizeof(pool_name)-1); + if (err < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) llapi_layout_pool_name_get() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + else if (pool_name[0] == '\0') { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("%s at %d: %s has NO Pool Name\n",__FILE__, __LINE__,dname); +#endif + goto err_out; + } + /* For example, Perlmutter @NERSC, pool_name "original" is returned */ + + /* Using pool_name returned from llapi_layout_pool_name_get() is not enough + * when calling llapi_get_poolmembers(). We need to prepend it with + * 'fsname', which can be obtained by calling llapi_getname(). Note that + * console command 'lfs getname -n' returns fsname. For example, on + * Perlmutter @NERSC: + * login39::~/Lustre(12:52) #1165 lfs getname -n $SCRATCH/dummy + * scratch + */ + err = llapi_getname(dname, fsname, 63); + if (err < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) llapi_getname() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + + /* When dname is a folder, fsname returned from llapi_getname() may contain + * a trailing ID, e.g. scratch-ffff9ca88d9bd800. Must remove the trailing + * ID, otherwise llapi_get_poolmembers() is not able to find it. 
+ */ + tail = strchr(fsname, '-'); + if (tail != NULL) *tail = '\0'; + + /* In case either pool_name and fsname are empty. For example, on Polaris + * @ALCF, the returned pool_name is empty, but fsname is not. + */ + if (pool_name[0] == '\0' && fsname[0] == '\0') + goto err_out; + else if (pool_name[0] == '\0') + strcpy(full_pool_name, fsname); + else if (fsname[0] == '\0') + strcpy(full_pool_name, pool_name); + else + sprintf(full_pool_name, "%s.%s", fsname, pool_name); + +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + printf("%s at %d: file=%s dir=%s pool=%s fsname=%s full_pool_name=%s\n", + __func__,__LINE__, filename,dname,pool_name,fsname,full_pool_name); +#endif + + /* Allocate memory for the members and buffer */ + members = (char **)NCI_Malloc(max_members * sizeof(char *)); + buffer = (char *)NCI_Malloc(buffer_size); + + /* obtain pool's info */ + num_members = llapi_get_poolmembers(full_pool_name, members, max_members, + buffer, buffer_size); +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + if (num_members > 0) { + int i, min_nmembers = MIN(num_members, 10); + printf("%s at %d: Found %d members for pool '%s':\n", + __func__,__LINE__,num_members, pool_name); + printf("\tFirst %d OSTs and last are\n",min_nmembers); + for (i=0; i= 0) close(dd); + if (layout != NULL) llapi_layout_free(layout); + if (dirc != NULL) NCI_Free(dirc); + if (buffer != NULL) NCI_Free(buffer); + if (members != NULL) NCI_Free(members); + + return num_members; +} +#endif + +static +int compare(const void *a, const void *b) +{ + if (*(uint64_t*)a > *(uint64_t*)b) return (1); + if (*(uint64_t*)a < *(uint64_t*)b) return (-1); + return (0); +} + +static +int sort_ost_ids(struct llapi_layout *layout, + uint64_t stripe_count, + uint64_t *osts) +{ + uint64_t i, numOSTs; + + for (i=0; i osts[numOSTs]) + osts[++numOSTs] = osts[i]; + + return (numOSTs + 1); +} + +/*----< get_striping() >-----------------------------------------------------*/ +static +uint64_t get_striping(int fd, + const char *path, + uint64_t 
*pattern, + uint64_t *stripe_count, + uint64_t *stripe_size, + uint64_t *start_iodevice) +{ + int err; + struct llapi_layout *layout; + uint64_t *osts=NULL, numOSTs=0; +#ifdef PNETCDF_LUSTRE_DEBUG + char int_str[32]; +#endif + + *pattern = LLAPI_LAYOUT_RAID0; + *stripe_count = LLAPI_LAYOUT_DEFAULT; + *stripe_size = LLAPI_LAYOUT_DEFAULT; + *start_iodevice = LLAPI_LAYOUT_DEFAULT; + + layout = llapi_layout_get_by_fd(fd, LLAPI_LAYOUT_GET_COPY); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) llapi_layout_get_by_fd() fails\n", + __FILE__, __LINE__); +#endif + goto err_out; + } + + err = llapi_layout_pattern_get(layout, pattern); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *pattern); + printf("Error at %s (%d) llapi_layout_pattern_get() fails to get pattern %s\n", + __FILE__, __LINE__, PATTERN_STR(*pattern, int_str)); +#endif + goto err_out; + } + + /* obtain file striping count */ + err = llapi_layout_stripe_count_get(layout, stripe_count); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *stripe_count); + printf("Error at %s (%d) llapi_layout_stripe_count_get() fails to get stripe count %s\n", + __FILE__, __LINE__, PATTERN_STR(*stripe_count, int_str)); +#endif + goto err_out; + } + + /* obtain file striping unit size */ + err = llapi_layout_stripe_size_get(layout, stripe_size); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *stripe_size); + printf("Error at %s (%d) llapi_layout_stripe_size_get() fails to get stripe size %s\n", + __FILE__,__LINE__, PATTERN_STR(*stripe_size, int_str)); +#endif + goto err_out; + } + + /* /usr/include/linux/lustre/lustre_user.h + * The stripe size fields are shared for the extension size storage, + * however the extension size is stored in KB, not bytes. 
+ * #define SEL_UNIT_SIZE 1024llu + * Therefore, the default stripe_size is (SEL_UNIT_SIZE * 1024) + */ + + if (*stripe_count == LLAPI_LAYOUT_DEFAULT || /* not set */ + *stripe_count == LLAPI_LAYOUT_INVALID || /* invalid */ + *stripe_count == LLAPI_LAYOUT_WIDE || /* all system's OSTs */ + *stripe_count > 1048576) { /* abnormally large number */ + return 0; + } + + /* obtain all OST IDs */ + osts = (uint64_t*) NCI_Malloc(sizeof(uint64_t) * (*stripe_count)); + if (llapi_layout_ost_index_get(layout, 0, &osts[0]) != 0) { + /* check if is a folder */ + struct stat path_stat; + fstat(fd, &path_stat); +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + if (S_ISREG(path_stat.st_mode)) /* not a regular file */ + printf("%s at %d: %s is a regular file\n",__func__,__LINE__,path); + else if (S_ISDIR(path_stat.st_mode)) + printf("%s at %d: %s is a folder\n",__func__,__LINE__,path); + else +#endif + if (!S_ISREG(path_stat.st_mode) && /* not a regular file */ + !S_ISDIR(path_stat.st_mode)) { /* not a folder */ +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) calling fstat() file %s (neither a regular file nor a folder)\n", \ + __FILE__, __LINE__, path); +#endif + goto err_out; + } + + *start_iodevice = LLAPI_LAYOUT_DEFAULT; + numOSTs = *stripe_count; + + goto err_out; + } + *start_iodevice = osts[0]; + + numOSTs = sort_ost_ids(layout, *stripe_count, osts); + assert(numOSTs <= *stripe_count); + +err_out: + if (osts != NULL) NCI_Free(osts); + if (layout != NULL) llapi_layout_free(layout); + + return numOSTs; +} + +/*----< set_striping() >-----------------------------------------------------*/ +static +int set_striping(const char *path, + uint64_t pattern, + uint64_t numOSTs, + uint64_t stripe_count, + uint64_t stripe_size, + uint64_t start_iodevice) +{ + int fd=-1, err=0; + + struct llapi_layout *layout = llapi_layout_alloc(); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) llapi_layout_alloc() fails (%s)\n", + __FILE__, __LINE__, strerror(errno)); +#endif 
+ goto err_out; + } + + /* When an abnormally large stripe_count is set by users, Lustre may just + * allocate the total number of available OSTs, instead of returning an + * error. + */ + err = llapi_layout_stripe_count_set(layout, stripe_count); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) llapi_layout_stripe_count_set() fails set stripe count %lu (%s)\n", + __FILE__, __LINE__, stripe_count, strerror(errno)); +#endif + goto err_out; + } + + err = llapi_layout_stripe_size_set(layout, stripe_size); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + printf("Error at %s (%d) llapi_layout_stripe_size_set() fails to set stripe size %lu (%s)\n", + __FILE__, __LINE__, stripe_size, strerror(errno)); +#endif + goto err_out; + } + + if (pattern == LLAPI_LAYOUT_OVERSTRIPING) { + uint64_t i, ost_id; + if (start_iodevice == LLAPI_LAYOUT_DEFAULT) + start_iodevice = 0; + for (i=0; i------------------------------------------*/ +/* Construct the list of I/O aggregators. It sets the followings. + * fd->hints->cb_nodes and set file info for hint cb_nodes. + * fd->hints->ranklist[], an int array of size fd->hints->cb_nodes. + * fd->is_agg: indicating whether this rank is an I/O aggregator + * fd->my_cb_nodes_index: index into fd->hints->ranklist[]. 
-1 if N/A
+ */
+static
+int Lustre_set_cb_node_list(PNCIO_File *fd)
+{
+    int i, j, k, rank, nprocs, num_aggr, striping_factor;
+    int *nprocs_per_node, **ranks_per_node;
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &rank);
+
+    /* number of MPI processes running on each node */
+    nprocs_per_node = (int *) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int));
+
+    for (i=0; i<nprocs; i++) nprocs_per_node[fd->node_ids.ids[i]]++;
+
+    /* Construct rank IDs of MPI processes running on each node. All rows of
+     * ranks_per_node[] share one contiguous allocation of nprocs integers;
+     * row i starts right after row i-1.
+     */
+    ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->node_ids.num_nodes);
+    ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs);
+    for (i=1; i<fd->node_ids.num_nodes; i++)
+        ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1];
+
+    /* reset the counters, they are re-used as fill indices below */
+    for (i=0; i<fd->node_ids.num_nodes; i++) nprocs_per_node[i] = 0;
+
+    /* Populate ranks_per_node[], list of MPI ranks running on each node.
+     * Populate nprocs_per_node[], number of MPI processes on each node.
+     */
+    for (i=0; i<nprocs; i++) {
+        k = fd->node_ids.ids[i];
+        ranks_per_node[k][nprocs_per_node[k]] = i;
+        nprocs_per_node[k]++;
+    }
+
+    /* To save a call to MPI_Bcast(), all processes run the same codes below to
+     * calculate num_aggr, the number of aggregators (later becomes cb_nodes).
+     *
+     * The calculation is based on the number of compute nodes,
+     * fd->node_ids.num_nodes, and processes per node, nprocs_per_node. At this
+     * moment, all processes should have obtained the Lustre file striping
+     * settings.
+     */
+    striping_factor = fd->hints->striping_factor;
+
+    /* The callers store whatever root broadcast into striping_factor, which
+     * is -1 when the striping query did not succeed. All divisions and modulo
+     * operations below require a positive value, so fall back to one stripe
+     * (local copy only, the hint itself is left untouched).
+     */
+    if (striping_factor <= 0) striping_factor = 1;
+
+    if (striping_factor > nprocs) {
+        /* When number of MPI processes is less than striping_factor, set
+         * num_aggr to the max number less than nprocs that divides
+         * striping_factor. A naive way is:
+         *     num_aggr = nprocs;
+         *     while (striping_factor % num_aggr > 0)
+         *         num_aggr--;
+         * Below is equivalent, but faster.
+         */
+        int divisor = 2;
+        num_aggr = 1;
+        /* try to divide */
+        while (striping_factor >= divisor * divisor) {
+            if ((striping_factor % divisor) == 0) {
+                if (striping_factor / divisor <= nprocs) {
+                    /* The value is found ! */
+                    num_aggr = striping_factor / divisor;
+                    break;
+                }
+                /* if divisor is less than nprocs, divisor is a solution,
+                 * but it is not sure that it is the best one
+                 */
+                else if (divisor <= nprocs)
+                    num_aggr = divisor;
+            }
+            divisor++;
+        }
+    }
+    else { /* striping_factor <= nprocs */
+        /* Select striping_factor processes to be I/O aggregators. Note this
+         * also applies to collective reads to allow more/less aggregators. In
+         * most cases, more aggregators yields better read performance.
+         */
+        if (fd->hints->cb_nodes == 0) {
+            /* User did not set hint "cb_nodes" */
+            if (nprocs >= striping_factor * 8 &&
+                nprocs/fd->node_ids.num_nodes >= 8)
+                num_aggr = striping_factor * 8;
+            else if (nprocs >= striping_factor * 4 &&
+                     nprocs/fd->node_ids.num_nodes >= 4)
+                num_aggr = striping_factor * 4;
+            else if (nprocs >= striping_factor * 2 &&
+                     nprocs/fd->node_ids.num_nodes >= 2)
+                num_aggr = striping_factor * 2;
+            else
+                num_aggr = striping_factor;
+        }
+        else if (fd->hints->cb_nodes <= striping_factor) {
+            /* User has set hint cb_nodes and cb_nodes <= striping_factor.
+             * Ignore user's hint and try to set cb_nodes to be at least
+             * striping_factor.
+             */
+            num_aggr = striping_factor;
+        }
+        else {
+            /* User has set hint cb_nodes and cb_nodes > striping_factor */
+            if (nprocs < fd->hints->cb_nodes)
+                num_aggr = nprocs; /* BAD cb_nodes set by users */
+            else
+                num_aggr = fd->hints->cb_nodes;
+        }
+
+        /* Number of processes per node may not be enough to be picked as
+         * aggregators. In this case, reduce num_aggr (cb_nodes). Consider the
+         * following case:
+         *     number of nodes = 7,
+         *     number of processes = 18,
+         *     striping_factor = 8,
+         *     cb_nodes = 16.
+         * In this case, nodes 0, 1, 2, 3 run 3 processes each and nodes
+         * 4, 5, 6 run 2 processes each. In order to keep each OST only
+         * accessed by one or more aggregators running on the same compute
+         * node, cb_nodes should be reduced to 8. Thus the ranks of aggregators
+         * become 0, 3, 6, 9, 12, 14, 16, 1. The aggregator-OST mapping
+         * becomes below.
+         *     Aggregator  0, running on node 0, access OST 0.
+         *     Aggregator  3, running on node 1, access OST 1.
+         *     Aggregator  6, running on node 2, access OST 2.
+         *     Aggregator  9, running on node 3, access OST 3.
+         *     Aggregator 12, running on node 4, access OST 4.
+         *     Aggregator 14, running on node 5, access OST 5.
+         *     Aggregator 16, running on node 6, access OST 6.
+         *     Aggregator  1, running on node 0, access OST 7.
+         *
+         * Another case (the total number of processes changes to 25):
+         *     number of nodes = 7,
+         *     number of processes = 25,
+         *     striping_factor = 8,
+         *     cb_nodes = 16.
+         * In this case, nodes 0, 1, 2, 3 run 4 processes each and nodes 4, 5,
+         * 6 run 3 processes each. cb_nodes should remain 16 and the ranks of
+         * aggregators become 0, 4, 8, 12, 16, 19, 22, 1, 2, 6, 10, 14, 18, 21,
+         * 24, 3. The aggregator-OST mapping becomes below.
+         *     Aggregators  0,  2, running on node 0, access OST 0.
+         *     Aggregators  4,  6, running on node 1, access OST 1.
+         *     Aggregators  8, 10, running on node 2, access OST 2.
+         *     Aggregators 12, 14, running on node 3, access OST 3.
+         *     Aggregators 16, 18, running on node 4, access OST 4.
+         *     Aggregators 19, 21, running on node 5, access OST 5.
+         *     Aggregators 22, 24, running on node 6, access OST 6.
+         *     Aggregator   3,     running on node 0, access OST 7.
+         */
+        int max_nprocs_node = 0;
+        for (i=0; i<fd->node_ids.num_nodes; i++)
+            max_nprocs_node = MAX(max_nprocs_node, nprocs_per_node[i]);
+        int max_naggr_node = striping_factor / fd->node_ids.num_nodes;
+        if (striping_factor % fd->node_ids.num_nodes) max_naggr_node++;
+        /* max_naggr_node is the max number of processes per node to be picked
+         * as aggregator in each round.
+         */
+        int rounds = num_aggr / striping_factor;
+        if (num_aggr % striping_factor) rounds++;
+        while (max_naggr_node * rounds > max_nprocs_node) rounds--;
+        num_aggr = striping_factor * rounds;
+    }
+
+    /* TODO: the above setting for num_aggr is for collective writes. Should
+     * collective reads use the same? Or just set cb_nodes to the number of
+     * nodes.
+     */
+
+    /* Next step is to determine the MPI rank IDs of I/O aggregators and add
+     * them into ranklist[]. Note fd->hints->ranklist will be freed in
+     * PNCIO_File_close().
+     */
+    fd->hints->ranklist = (int *) NCI_Malloc(num_aggr * sizeof(int));
+    if (fd->hints->ranklist == NULL) {
+        /* release the work arrays, they are not referenced after return */
+        NCI_Free(nprocs_per_node);
+        NCI_Free(ranks_per_node[0]);
+        NCI_Free(ranks_per_node);
+        return NC_ENOMEM;
+    }
+
+    int block_assignment=0;
+#ifdef TRY_AGGR_BLOCK_ASSIGNMENT
+    {
+        char *env_str;
+        if ((env_str = getenv("PNETCDF_USE_BLOCK_ASSIGN")) != NULL)
+            block_assignment = (strcasecmp(env_str, "true") == 0) ? 1 : 0;
+        if (rank == 0)
+            printf("%s %d: PNETCDF_USE_BLOCK_ASSIGN = %d\n",
+                   __func__,__LINE__,block_assignment);
+    }
+#endif
+
+    if (striping_factor <= fd->node_ids.num_nodes) {
+        /* When number of OSTs is less than number of compute nodes, first
+         * select number of nodes equal to the number of OSTs by spreading the
+         * selection evenly across all compute nodes (i.e. with a stride
+         * between every 2 consecutive nodes).
+         * Selection of MPI ranks can be done in 2 ways.
+         * 1. block assignment
+         *    Select ranks from a node and then move on to the next node.
+         * 2. cyclic assignment
+         *    Select ranks round-robin across all selected nodes.
+         * Note when selecting ranks within a node, the ranks are evenly spread
+         * among all processes in the node.
+         */
+        if (block_assignment) {
+            int n=0;
+            int remain = num_aggr % striping_factor;
+            int node_stride = fd->node_ids.num_nodes / striping_factor;
+            /* walk through each node and pick aggregators */
+            for (j=0; j<fd->node_ids.num_nodes; j+=node_stride) {
+                /* Selecting node IDs with a stride. j is the node ID */
+                int nranks_per_node = num_aggr / striping_factor;
+                /* front nodes may have 1 more to pick */
+                if (remain > 0 && j/node_stride < remain) nranks_per_node++;
+                int rank_stride = nprocs_per_node[j] / nranks_per_node;
+                for (k=0; k<nranks_per_node; k++) {
+                    fd->hints->ranklist[n] = ranks_per_node[j][k*rank_stride];
+                    if (++n == num_aggr) {
+                        j = fd->node_ids.num_nodes; /* break loop j */
+                        break; /* loop k */
+                    }
+                }
+            }
+        }
+        else {
+            int avg = num_aggr / striping_factor;
+            int stride = fd->node_ids.num_nodes / striping_factor;
+            if (num_aggr % striping_factor) avg++;
+            for (i = 0; i < num_aggr; i++) {
+                /* j is the selected node ID. This selection is round-robin
+                 * across selected nodes.
+                 */
+                j = (i % striping_factor) * stride;
+                k = (i / striping_factor) * (nprocs_per_node[j] / avg);
+                assert(k < nprocs_per_node[j]);
+                fd->hints->ranklist[i] = ranks_per_node[j][k];
+            }
+        }
+    }
+    else { /* striping_factor > fd->node_ids.num_nodes */
+        /* When number of OSTs is more than number of compute nodes, I/O
+         * aggregators are selected from all nodes. Within each node,
+         * aggregators are spread evenly instead of the first few ranks.
+         */
+        int *naggr_per_node, *idx_per_node, avg;
+        idx_per_node = (int*) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int));
+        naggr_per_node = (int*) NCI_Malloc(fd->node_ids.num_nodes * sizeof(int));
+        /* the first (striping_factor % num_nodes) nodes take one extra */
+        for (i = 0; i < striping_factor % fd->node_ids.num_nodes; i++)
+            naggr_per_node[i] = striping_factor / fd->node_ids.num_nodes + 1;
+        for (; i < fd->node_ids.num_nodes; i++)
+            naggr_per_node[i] = striping_factor / fd->node_ids.num_nodes;
+        avg = num_aggr / striping_factor;
+        if (avg > 0)
+            for (i = 0; i < fd->node_ids.num_nodes; i++)
+                naggr_per_node[i] *= avg;
+        for (i = 0; i < fd->node_ids.num_nodes; i++)
+            naggr_per_node[i] = MIN(naggr_per_node[i], nprocs_per_node[i]);
+        /* naggr_per_node[] is the number of aggregators that can be
+         * selected as I/O aggregators
+         */
+
+        if (block_assignment) {
+            int n = 0;
+            for (j=0; j<fd->node_ids.num_nodes; j++) {
+                /* j is the node ID */
+                int rank_stride = nprocs_per_node[j] / naggr_per_node[j];
+                /* try stride==1 seems no effect, rank_stride = 1; */
+                for (k=0; k<naggr_per_node[j]; k++) {
+                    fd->hints->ranklist[n] = ranks_per_node[j][k*rank_stride];
+                    if (++n == num_aggr) {
+                        j = fd->node_ids.num_nodes; /* break loop j */
+                        break; /* loop k */
+                    }
+                }
+            }
+        }
+        else {
+            for (i = 0; i < num_aggr; i++) {
+                int stripe_i = i % striping_factor;
+                j = stripe_i % fd->node_ids.num_nodes; /* select from node j */
+                k = nprocs_per_node[j] / naggr_per_node[j];
+                k *= idx_per_node[j];
+                /* try stride==1 seems no effect, k = idx_per_node[j]; */
+                idx_per_node[j]++;
+                assert(k < nprocs_per_node[j]);
+                fd->hints->ranklist[i] = ranks_per_node[j][k];
+            }
+        }
+        NCI_Free(naggr_per_node);
+        NCI_Free(idx_per_node);
+    }
+
+    /* TODO: we can keep these two arrays in case for dynamic construction
+     * of fd->hints->ranklist[], such as in group-cyclic file domain
+     * assignment method, used in each collective write call.
+     */
+    NCI_Free(nprocs_per_node);
+    NCI_Free(ranks_per_node[0]);
+    NCI_Free(ranks_per_node);
+
+    /* set file striping hints */
+    fd->hints->cb_nodes = num_aggr;
+
+    /* check whether this process is selected as an I/O aggregator */
+    fd->is_agg = 0;
+    fd->my_cb_nodes_index = -1;
+    for (i = 0; i < num_aggr; i++) {
+        if (rank == fd->hints->ranklist[i]) {
+            fd->is_agg = 1;
+            fd->my_cb_nodes_index = i;
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/*----< PNCIO_Lustre_create() >----------------------------------------------*/
+/*   1. root creates the file
+ *   2. root sets and obtains striping info
+ *   3. root broadcasts striping info
+ *   4. non-root processes receive striping info from root
+ *   5. non-root processes opens the file
+ */
+int
+PNCIO_Lustre_create(PNCIO_File *fd,
+                    int         mpi_io_mode)
+{
+    char int_str[16];
+    int err=NC_NOERR, rank, perm, old_mask;
+    int stripin_info[4] = {-1, -1, -1, -1};
+#ifdef HAVE_LUSTRE
+    int total_num_OSTs;
+    uint64_t numOSTs, pattern, stripe_count, stripe_size, start_iodevice;
+#endif
+
+#ifdef WKL_DEBUG
+extern int first_ost_id;
+first_ost_id = -1;
+#endif
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+int world_rank; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+static int wkl=0; if (wkl == 0 && world_rank == 0) { printf("\nxxxx %s at %d: %s ---- %s\n",__func__,__LINE__,(fd->file_system == PNCIO_LUSTRE)?"PNCIO_LUSTRE":"PNCIO_UFS",fd->filename); wkl++; fflush(stdout);}
+#endif
+
+#if defined(HAVE_LUSTRE) || defined(MIMIC_LUSTRE)
+assert(mpi_io_mode & MPI_MODE_CREATE);
+
+/* Note ncmpi_create always creates a file with readable and writable permission. */
+    int amode = O_CREAT;
+    if (mpi_io_mode & MPI_MODE_RDWR) amode |= O_RDWR;
+#endif
+
+    /* read the current umask without changing it */
+    old_mask = umask(022);
+    umask(old_mask);
+    perm = old_mask ^ PNCIO_PERM;
+
+    /* root process creates the file first, followed by all processes open the
+     * file.
+     */
+    if (rank > 0) goto err_out;
+
+    /* For Lustre, we need to obtain file striping info (striping_factor,
+     * striping_unit, and num_osts) in order to select the I/O aggregators
+     * in fd->hints->ranklist, whether in open or create mode.
+     */
+
+// printf("fd->hints->nc_striping %s\n", (fd->hints->nc_striping == PNCIO_STRIPING_AUTO)?"AUTO":"INHERIT");
+
+#ifdef HAVE_LUSTRE
+    int overstriping_ratio, str_factor, str_unit, start_iodev;
+
+    /* In a call to PNCIO_File_SetInfo() earlier, hints have been validated to
+     * be consistent among all processes.
+     */
+
+    str_unit           = fd->hints->striping_unit;
+    str_factor         = fd->hints->striping_factor;
+    start_iodev        = fd->hints->start_iodevice;
+    overstriping_ratio = fd->hints->lustre_overstriping_ratio;
+
+    if (overstriping_ratio <= 0) /* hint not set or disabled */
+        overstriping_ratio = 1;
+
+    /* obtain the total number of OSTs available */
+    total_num_OSTs = get_total_avail_osts(fd->filename);
+    if (total_num_OSTs <= 0) /* failed to obtain number of available OSTs */
+        total_num_OSTs = PNCIO_LUSTRE_MAX_OSTS;
+
+    /* make sure str_factor <= overstriping_ratio * total_num_OSTs */
+    if (str_factor > overstriping_ratio * total_num_OSTs)
+        str_factor = overstriping_ratio * total_num_OSTs;
+
+    numOSTs=0;
+    pattern        = LLAPI_LAYOUT_DEFAULT;
+    stripe_count   = LLAPI_LAYOUT_DEFAULT;
+    stripe_size    = LLAPI_LAYOUT_DEFAULT;
+    start_iodevice = LLAPI_LAYOUT_DEFAULT;
+
+    fd->fd_sys = -1;
+
+    /* When no file striping hint is set, their default values are:
+     *     fd->hints->striping_factor = 0;
+     *     fd->hints->striping_unit = 0;
+     *     fd->hints->start_iodevice = -1;
+     *     fd->hints->lustre_overstriping_ratio = 1;
+     */
+
+    /* Now select file striping configuration for the new file. In many cases,
+     * the Lustre striping configuration of the file to be created is not
+     * explicitly set by the users (through I/O hints striping_factor and
+     * striping_unit) and the striping configuration of parent folder to store
+     * the new file is not explicitly set by the users.
+     *
+     * Codes below try to set the striping for the new file. Precedences are:
+     * 1. When hints striping_factor and striping_unit are explicitly set, they
+     *    are used as the top precedence.
+     * 2. When hint nc_striping is set to "inherit", the striping will inherit
+     *    from the parent folder. If the parent folder's striping count is not
+     *    set, then this hint is ignored.
+     * 3. When no hint are set, set the new file's striping count to be equal
+     *    to the number of compute nodes allocated to fd->comm and the striping
+     *    size to 1 MiB.
+     */
+    if (fd->hints->striping_factor == 0 &&
+        fd->hints->nc_striping == PNCIO_STRIPING_INHERIT) {
+        /* Inherit the file striping settings from the parent folder. */
+        int dd;
+        char *dirc, *dname;
+
+        /* dirname() may modify its argument, so work on a copy */
+        dirc = NCI_Strdup(fd->filename);
+        dname = dirname(dirc); /* folder name */
+
+        dd = open(dname, O_RDONLY, PNCIO_PERM);
+
+        numOSTs = get_striping(dd, dname, &pattern,
+                                          &stripe_count,
+                                          &stripe_size,
+                                          &start_iodevice);
+        close(dd);
+        NCI_Free(dirc);
+
+#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE
+        printf("line %d: use parent folder's striping to set file's:\n",__LINE__);
+        PRINT_LAYOUT(numOSTs);
+        PRINT_LAYOUT(stripe_count);
+        PRINT_LAYOUT(stripe_size);
+        PRINT_LAYOUT(start_iodevice);
+        PRINT_LAYOUT(pattern);
+#endif
+    }
+
+    /* If hint striping_factor is not set by the user and the new file's folder
+     * has not set its striping parameters, then we set the number of unique
+     * OSTs, numOSTs, to the number of compute nodes allocated to this job,
+     * which sets stripe_count to (numOSTs * overstriping_ratio).
+     */
+    if (str_factor == 0 && (stripe_count == LLAPI_LAYOUT_DEFAULT ||
+                            stripe_count == LLAPI_LAYOUT_WIDE)) {
+        stripe_count = MIN(fd->node_ids.num_nodes, total_num_OSTs);
+        if (overstriping_ratio > 1) stripe_count *= overstriping_ratio;
+    }
+    else if (str_factor > 0)
+        stripe_count = str_factor;
+
+    /* When overstriping is requested by the user, calculate the number of
+     * unique OSTs.
+     */
+    if (overstriping_ratio > 1) {
+        pattern = LLAPI_LAYOUT_OVERSTRIPING;
+        if (stripe_count < overstriping_ratio)
+            numOSTs = 1;
+        else
+            numOSTs = stripe_count / overstriping_ratio;
+    }
+    /* If ill values are detected, fall back to no overstriping */
+    if (overstriping_ratio <= 1 || numOSTs == stripe_count) {
+        numOSTs = stripe_count;
+        pattern = LLAPI_LAYOUT_RAID0;
+    }
+
+    /* A user-supplied striping_unit hint overrides the value obtained so far
+     * (the folder's stripe size, or the Lustre default when neither is set).
+     */
+    if (str_unit > 0)
+        stripe_size = str_unit;
+
+    /* A user-supplied start_iodevice hint overrides the value obtained so
+     * far. The "unset" value of this hint is -1, so 0 is a legitimate request
+     * for the first OST and must be honored as well.
+     */
+    if (start_iodev >= 0)
+        start_iodevice = start_iodev;
+
+#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE
+    printf("\n\tAfter adjust striping parameters become:\n");
+    PRINT_LAYOUT(numOSTs);
+    PRINT_LAYOUT(stripe_count);
+    PRINT_LAYOUT(stripe_size);
+    PRINT_LAYOUT(start_iodevice);
+    PRINT_LAYOUT(pattern);
+#endif
+
+    /* create a new file and set striping */
+    fd->fd_sys = set_striping(fd->filename, pattern,
+                                            numOSTs,
+                                            stripe_count,
+                                            stripe_size,
+                                            start_iodevice);
+
+    if (fd->fd_sys < 0)
+        /* If explicitly setting file striping failed, inherit the striping
+         * from the folder by simply creating the file.
+         */
+        fd->fd_sys = open(fd->filename, amode, perm);
+
+    if (fd->fd_sys < 0) {
+        fprintf(stderr,"Error at %s (%d) fails to create file %s (%s)\n",
+                __FILE__,__LINE__, fd->filename, strerror(errno));
+        err = ncmpii_error_posix2nc("Lustre set striping");
+        goto err_out;
+    }
+    fd->is_open = 1;
+
+    /* Obtain Lustre file striping parameters actually set. */
+    numOSTs = get_striping(fd->fd_sys, fd->filename, &pattern,
+                                                     &stripe_count,
+                                                     &stripe_size,
+                                                     &start_iodevice);
+
+    stripin_info[0] = stripe_size;
+    stripin_info[1] = stripe_count;
+    stripin_info[2] = start_iodevice;
+    stripin_info[3] = numOSTs;
+
+#elif defined(MIMIC_LUSTRE)
+    fd->fd_sys = open(fd->filename, amode, perm);
+    if (fd->fd_sys == -1) {
+        printf("%s line %d: rank %d fails to create file %s (%s)\n",
+               __FILE__,__LINE__, rank, fd->filename, strerror(errno));
+        err = ncmpii_error_posix2nc("open");
+        goto err_out;
+    }
+    fd->is_open = 1;
+
+    char *env_str = getenv("MIMIC_STRIPE_SIZE");
+    if (env_str != NULL)
+        stripin_info[0] = atoi(env_str);
+    else
+        stripin_info[0] = STRIPE_SIZE;
+    stripin_info[1] = STRIPE_COUNT;
+    stripin_info[2] = 0;
+    stripin_info[3] = STRIPE_COUNT;
+#endif
+
+err_out:
+    /* All ranks, including a root that failed above, must reach this
+     * collective; on failure root broadcasts the initial -1 values.
+     */
+    MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm);
+    if (fd->file_system == PNCIO_LUSTRE &&
+        (stripin_info[0] == -1 || stripin_info[3] == 0)) {
+        fprintf(stderr, "%s line %d: failed to create Lustre file %s\n",
+                __FILE__, __LINE__, fd->filename);
+        /* NOTE(review): only root can have a non-zero err here; non-root
+         * ranks return NC_NOERR although the creation failed. Confirm that
+         * the caller reconciles the error code across ranks.
+         */
+        return err;
+    }
+
+    fd->hints->striping_unit   = stripin_info[0];
+    fd->hints->striping_factor = stripin_info[1];
+    fd->hints->start_iodevice  = stripin_info[2];
+    if (fd->file_system == PNCIO_LUSTRE) {
+        /* stripin_info[3] is known to be non-zero from the check above */
+        fd->hints->lustre_num_osts = stripin_info[3];
+        fd->hints->lustre_overstriping_ratio = stripin_info[1] / stripin_info[3];
+    }
+
+    if (rank > 0) { /* non-root processes */
+        fd->fd_sys = open(fd->filename, O_RDWR, perm);
+        if (fd->fd_sys == -1) {
+            fprintf(stderr,"%s line %d: rank %d failure to open file %s (%s)\n",
+                    __FILE__,__LINE__, rank, fd->filename, strerror(errno));
+            /* the failing call is open(), label the error accordingly */
+            return ncmpii_error_posix2nc("open");
+        }
+        fd->is_open = 1;
+    }
+
+    /* construct cb_nodes rank list; keep the first error encountered */
+    {
+        int rc = Lustre_set_cb_node_list(fd);
+        if (rc != 0 && err == NC_NOERR) err = rc;
+    }
+
+    MPI_Info_set(fd->info, "romio_filesystem_type", "LUSTRE:");
+
+    snprintf(int_str, 16, "%d", fd->hints->lustre_num_osts);
+    MPI_Info_set(fd->info, "lustre_num_osts", int_str);
+
+    snprintf(int_str, 16, "%d", fd->hints->lustre_overstriping_ratio);
+    MPI_Info_set(fd->info, "lustre_overstriping_ratio", int_str);
+
+    return err;
+}
+
+/*----< PNCIO_Lustre_open() >------------------------------------------------*/
+/*   1. all processes open the file.
+ *   2. root obtains striping info and broadcasts to all others
+ */
+int
+PNCIO_Lustre_open(PNCIO_File *fd)
+{
+    char int_str[16];
+    int err=NC_NOERR, rank, perm, old_mask;
+    int stripin_info[4] = {1048576, -1, -1, -1};
+
+#ifdef WKL_DEBUG
+extern int first_ost_id;
+first_ost_id = -1;
+#endif
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+int world_rank; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+static int wkl=0; if (wkl == 0 && world_rank == 0) { printf("\nxxxx %s at %d: %s ---- %s\n",__func__,__LINE__,(fd->file_system == PNCIO_LUSTRE)?"PNCIO_LUSTRE":"PNCIO_UFS",fd->filename); wkl++; fflush(stdout);}
+#endif
+
+    /* read the current umask without changing it */
+    old_mask = umask(022);
+    umask(old_mask);
+    perm = old_mask ^ PNCIO_PERM;
+
+    int omode = (fd->access_mode & MPI_MODE_RDWR) ? O_RDWR : O_RDONLY;
+
+    /* All processes open the file. */
+    fd->fd_sys = open(fd->filename, omode, perm);
+    if (fd->fd_sys == -1) {
+        fprintf(stderr,"%s line %d: rank %d fails to open file %s (%s)\n",
+                __FILE__,__LINE__, rank, fd->filename, strerror(errno));
+        err = ncmpii_error_posix2nc("open");
+        goto err_out;
+    }
+    fd->is_open = 1;
+
+    /* Only root obtains the striping information and bcast to all other
+     * processes.
+     */
+    if (rank == 0) {
+#ifdef HAVE_LUSTRE
+        uint64_t numOSTs=0;
+        uint64_t pattern        = LLAPI_LAYOUT_DEFAULT;
+        uint64_t stripe_count   = LLAPI_LAYOUT_DEFAULT;
+        uint64_t stripe_size    = LLAPI_LAYOUT_DEFAULT;
+        uint64_t start_iodevice = LLAPI_LAYOUT_DEFAULT;
+
+        numOSTs = get_striping(fd->fd_sys, fd->filename, &pattern,
+                                                         &stripe_count,
+                                                         &stripe_size,
+                                                         &start_iodevice);
+
+        stripin_info[0] = stripe_size;
+        stripin_info[1] = stripe_count;
+        stripin_info[2] = start_iodevice;
+        stripin_info[3] = numOSTs;
+
+#elif defined(MIMIC_LUSTRE)
+        char *env_str = getenv("MIMIC_STRIPE_SIZE");
+        if (env_str != NULL)
+            stripin_info[0] = atoi(env_str);
+        else
+            stripin_info[0] = STRIPE_SIZE;
+        stripin_info[1] = STRIPE_COUNT;
+        stripin_info[2] = 0;
+        stripin_info[3] = STRIPE_COUNT;
+#endif
+    }
+
+err_out:
+    /* All ranks, including those whose open() failed, must reach this
+     * collective. A root that failed broadcasts the initial values.
+     */
+    MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm);
+    fd->hints->striping_unit   = stripin_info[0];
+    fd->hints->striping_factor = stripin_info[1];
+    fd->hints->start_iodevice  = stripin_info[2];
+    fd->hints->lustre_num_osts = stripin_info[3];
+    /* numOSTs starts at 0 on root, so a zero OST count can be broadcast;
+     * avoid dividing by it. A ratio of 1 means no overstriping, which is also
+     * what the initial {-1, -1} pair evaluates to.
+     */
+    if (stripin_info[3] > 0)
+        fd->hints->lustre_overstriping_ratio = stripin_info[1] / stripin_info[3];
+    else
+        fd->hints->lustre_overstriping_ratio = 1;
+
+    /* construct cb_nodes rank list; keep the first error encountered */
+    {
+        int rc = Lustre_set_cb_node_list(fd);
+        if (rc != 0 && err == NC_NOERR) err = rc;
+    }
+
+    MPI_Info_set(fd->info, "romio_filesystem_type", "LUSTRE:");
+
+    snprintf(int_str, 16, "%d", fd->hints->lustre_num_osts);
+    MPI_Info_set(fd->info, "lustre_num_osts", int_str);
+
+    snprintf(int_str, 16, "%d", fd->hints->lustre_overstriping_ratio);
+    MPI_Info_set(fd->info, "lustre_overstriping_ratio", int_str);
+
+    return err;
+}
+
diff --git a/src/drivers/pncio/pncio_lustre_wrcoll.c b/src/drivers/pncio/pncio_lustre_wrcoll.c
new file mode 100644
index 0000000000..9e412d8e3f
--- /dev/null
+++ b/src/drivers/pncio/pncio_lustre_wrcoll.c
@@ -0,0 +1,2441 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +static int use_alltoallw; + +#ifdef HAVE_MPI_LARGE_COUNT +#define MEMCPY_UNPACK(x, inbuf, start, count, outbuf) { \ + int _k; \ + char *_ptr = (inbuf); \ + MPI_Count *mem_ptrs = others_req[x].mem_ptrs + (start); \ + MPI_Offset *mem_lens = others_req[x].lens + (start); \ + for (_k=0; _khints->striping_unit; + + avail_bytes = (stripe_id + 1) * fd->hints->striping_unit - off; + if (avail_bytes < *len) { + /* The request [off, off+len) has only [off, off+avail_bytes) part + * falling into aggregator's file domain */ + *len = avail_bytes; + } + /* return the index to ranklist[] */ + return (stripe_id % fd->hints->cb_nodes); +} + +/*----< LUSTRE_Calc_my_req() >-----------------------------------------------*/ +/* calculates what portions of the read/write requests of this process fall + * into the file domains of all I/O aggregators. + * IN: fd->flat_file: this rank's flattened write requests + * fd->flat_file.count: number of noncontiguous offset-length file requests + * fd->flat_file.off[fd->flat_file.count] file offsets of individual + * noncontiguous requests. + * fd->flat_file.len[fd->flat_file.count] lengths of individual + * noncontiguous requests. + * IN: buf_is_contig: whether the write buffer is contiguous or not + * OUT: my_req_ptr[cb_nodes] offset-length pairs of this process's requests + * fall into the file domain of each aggregator. + * OUT: buf_idx_ptr[cb_nodes] index pointing to the starting location in + * user_buf for data to be sent to each aggregator. 
+ */ +static +void LUSTRE_Calc_my_req(PNCIO_File *fd, + int buf_is_contig, + PNCIO_Access **my_req_ptr, + MPI_Offset **buf_idx) +{ + int aggr, *aggr_ranks, cb_nodes; + MPI_Count i, l; + size_t nelems, alloc_sz; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset rem_len, avail_len, *avail_lens; +#else + int rem_len, avail_len, *avail_lens; +#endif + MPI_Offset curr_idx, off; + PNCIO_Access *my_req; + + /* fd->flat_file.count has been checked and adjusted to a possitive number + * at the beginning of PNCIO_LUSTRE_WriteStridedColl(). + */ + assert(fd->flat_file.count > 0); + + cb_nodes = fd->hints->cb_nodes; + + /* my_req[i].count gives the number of contiguous requests of this process + * that fall in aggregator i's file domain (not process MPI rank i). + */ + my_req = (PNCIO_Access *) NCI_Calloc(cb_nodes, sizeof(PNCIO_Access)); + *my_req_ptr = my_req; + + /* First pass is just to calculate how much space is needed to allocate + * my_req. + */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(int) + sizeof(MPI_Offset); + aggr_ranks = (int*) NCI_Malloc(alloc_sz * fd->flat_file.count); + avail_lens = (MPI_Offset*) (aggr_ranks + fd->flat_file.count); +#else + alloc_sz = sizeof(int) * 2; + aggr_ranks = (int*) NCI_Malloc(alloc_sz * fd->flat_file.count); + avail_lens = aggr_ranks + fd->flat_file.count; +#endif + + /* Note that MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0 Chapter + * 14.1.1) requires that the typemap displacements of etype and + * filetype are non-negative and monotonically non-decreasing. This + * makes fd->flat_file.off[] to be monotonically non-decreasing. + */ + +/* +Alternative: especially for when fd->flat_file.count is large +1 This rank's aggregate file access region is from start_offset to end_offset. +2 start with the 1st aggregator ID and keep assign aggregator until next stripe. 
+ This can avoid too many calls to LUSTRE_Calc_aggregator() +*/ + + /* nelems will be the number of offset-length pairs for my_req[] */ + nelems = 0; + for (i = 0; i < fd->flat_file.count; i++) { + /* short circuit offset/len processing if zero-byte read/write. */ + if (fd->flat_file.len[i] == 0) + continue; + + off = fd->flat_file.off[i]; + avail_len = fd->flat_file.len[i]; + /* LUSTRE_Calc_aggregator() modifies the value of 'avail_len' to the + * amount that is only covered by the aggr's file domain. The remaining + * (tail) will continue to be processed to determine to whose file + * domain it belongs. As LUSTRE_Calc_aggregator() can be expensive for + * large value of fd->flat_file.count, we keep a copy of the returned + * values of 'aggr' and 'avail_len' in aggr_ranks[] and avail_lens[] to + * be used in the next for loop (not next iteration). + * + * Note the returned value in 'aggr' is the index to ranklist[], i.e. + * the 'aggr'th element of array ranklist[], rather than the + * aggregator's MPI rank ID in fd->comm. + */ + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + aggr_ranks[i] = aggr; /* first aggregator ID of this request */ + avail_lens[i] = avail_len; /* length covered, may be < fd->flat_file.len[i] */ + assert(aggr >= 0 && aggr <= cb_nodes); + my_req[aggr].count++; /* increment for aggregator aggr */ + nelems++; /* true number of noncontiguous requests + * in terms of file domains */ + + /* rem_len is the amount of ith offset-length pair that is not covered + * by aggregator aggr's file domain. + */ + rem_len = fd->flat_file.len[i] - avail_len; + assert(rem_len >= 0); + + while (rem_len > 0) { + off += avail_len; /* move forward to first remaining byte */ + avail_len = rem_len; /* save remaining size, pass to calc */ + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + my_req[aggr].count++; + nelems++; + rem_len -= avail_len;/* reduce remaining length by amount from fd */ + } + } + + /* allocate space for buf_idx. 
+ * buf_idx is relevant only if buftype is contiguous. buf_idx[i] gives the + * starting index in user_buf where data will be sent to aggregator 'i'. + * This allows sends to be done without extra buffer. + */ + if (buf_idx != NULL && buf_is_contig) { + buf_idx[0] = (MPI_Offset *) NCI_Malloc(nelems * sizeof(MPI_Offset)); + for (i = 1; i < cb_nodes; i++) + buf_idx[i] = buf_idx[i - 1] + my_req[i - 1].count; + } + + /* allocate space for my_req and its members offsets and lens */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) * 2; + my_req[0].offsets = (MPI_Offset*) NCI_Malloc(alloc_sz * nelems); + my_req[0].lens = my_req[0].offsets + my_req[0].count; + for (i=1; iflat_file.count; i++) { + /* short circuit offset/len processing if zero-byte read/write. */ + if (fd->flat_file.len[i] == 0) + continue; + + off = fd->flat_file.off[i]; + aggr = aggr_ranks[i]; + assert(aggr >= 0 && aggr <= cb_nodes); + avail_len = avail_lens[i]; + + l = my_req[aggr].count; + if (buf_idx != NULL && buf_is_contig) { + buf_idx[aggr][l] = curr_idx; + curr_idx += avail_len; + } + rem_len = fd->flat_file.len[i] - avail_len; + + /* Each my_req[i] contains the number of this process's noncontiguous + * requests that fall into aggregator aggr's file domain. + * my_req[aggr].offsets[] and my_req[aggr].lens store the offsets and + * lengths of the requests. 
+ */ + my_req[aggr].offsets[l] = off; + my_req[aggr].lens[l] = avail_len; + my_req[aggr].count++; + + while (rem_len != 0) { + off += avail_len; + avail_len = rem_len; + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + assert(aggr >= 0 && aggr <= cb_nodes); + l = my_req[aggr].count; + if (buf_idx != NULL && buf_is_contig) { + buf_idx[aggr][l] = curr_idx; + curr_idx += avail_len; + } + rem_len -= avail_len; + + my_req[aggr].offsets[l] = off; + my_req[aggr].lens[l] = avail_len; + my_req[aggr].count++; + } + } + NCI_Free(aggr_ranks); +} + +/* LUSTRE_Calc_others_req() calculates what requests from each of other + * processes fall in this aggregator's file domain. + * IN: my_req[cb_nodes]: offset-length pairs of this rank's requests fall + * into each of aggregators + * OUT: count_others_req_per_proc[i]: number of noncontiguous requests of + * rank i that falls in this aggregator's file domain. + * OUT: others_req_ptr[nprocs]: requests of each of other ranks fall into + * this aggregator's file domain. + */ +static +void LUSTRE_Calc_others_req(PNCIO_File *fd, + const PNCIO_Access *my_req, + PNCIO_Access **others_req_ptr) +{ + int i, myrank, nprocs, do_alltoallv, nreqs; + MPI_Count *count_my_req_per_proc, *count_others_req_per_proc; + PNCIO_Access *others_req; + size_t npairs, alloc_sz, pair_sz; + MPI_Request *requests; + + /* first find out how much to send/recv and from/to whom */ + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + others_req = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access)); + *others_req_ptr = others_req; + + /* Use my_req[i].count (the number of noncontiguous requests fall in + * aggregator i's file domain) to set count_others_req_per_proc[j] (the + * number of noncontiguous requests from process j fall into this + * aggregator's file domain). + * + * The below MPI_Alltoall() is actually an all-to-many, i,e, all ranks + * send to aggregators only. 
+ */ + count_my_req_per_proc = (MPI_Count *) NCI_Calloc(nprocs * 2, sizeof(MPI_Count)); + count_others_req_per_proc = count_my_req_per_proc + nprocs; + for (i=0; ihints->cb_nodes; i++) + count_my_req_per_proc[fd->hints->ranklist[i]] = my_req[i].count; + +#if 1 + requests = NCI_Malloc(sizeof(MPI_Request) * (nprocs + fd->hints->cb_nodes)); + nreqs = 0; + if (fd->is_agg) { + for (i=0; icomm, &requests[nreqs++]); + } + for (i=0; ihints->cb_nodes; i++) { + int dest = fd->hints->ranklist[i]; + MPI_Issend(&my_req[i].count, 1, MPI_COUNT, dest, 0, fd->comm, &requests[nreqs++]); + } + if (nreqs) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nreqs, requests, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + MPI_Waitall(nreqs, requests, statuses); + NCI_Free(statuses); +#endif + } + NCI_Free(requests); +#else + MPI_Alltoall(count_my_req_per_proc, 1, MPI_COUNT, + count_others_req_per_proc, 1, MPI_COUNT, fd->comm); +#endif + + /* calculate total number of offset-length pairs to be handled by this + * aggregator, only aggregators will have non-zero number of pairs. + */ + npairs = 0; + for (i=0; inum_nodes > 0) ? 
(nprocs / fd->num_nodes > 48) : 0; +#else + do_alltoallv=0; +#endif + + if (do_alltoallv) { + MPI_Offset *r_off_buf=NULL, *s_off_buf=NULL; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *sendCounts, *recvCounts; + MPI_Aint *sdispls, *rdispls; + alloc_sz = sizeof(MPI_Count) * 2 + sizeof(MPI_Aint) * 2; + sendCounts = (MPI_Count*) NCI_Calloc(nprocs, alloc_sz); + recvCounts = sendCounts + nprocs; + sdispls = (MPI_Aint*) (recvCounts + nprocs); + rdispls = sdispls + nprocs; +#else + int *sendCounts, *recvCounts, *sdispls, *rdispls; + alloc_sz = sizeof(int) * 4; + sendCounts = (int*) NCI_Calloc(nprocs, alloc_sz); + recvCounts = sendCounts + nprocs; + sdispls = recvCounts + nprocs; + rdispls = sdispls + nprocs; +#endif + + /* prepare receive side */ + r_off_buf = others_req[0].offsets; + for (i=0; ihints->cb_nodes; i++) { + int dest = fd->hints->ranklist[i]; + sendCounts[dest] = my_req[i].count * pair_sz; + /* Note all my_req[*].offsets are allocated in a single malloc(). */ + sdispls[dest] = (char*)my_req[i].offsets - (char*)s_off_buf; + } + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Alltoallv_c(s_off_buf, sendCounts, sdispls, MPI_BYTE, + r_off_buf, recvCounts, rdispls, MPI_BYTE, fd->comm); +#else + MPI_Alltoallv(s_off_buf, sendCounts, sdispls, MPI_BYTE, + r_off_buf, recvCounts, rdispls, MPI_BYTE, fd->comm); +#endif + + NCI_Free(sendCounts); + } + else { /* instead of using alltoall, use MPI_Issend and MPI_Irecv */ + requests = (MPI_Request *) + NCI_Malloc(sizeof(MPI_Request) * (nprocs + fd->hints->cb_nodes)); + + nreqs = 0; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count == 0) /* nothing to receive from rank i */ + continue; + + /* Note the memory address of others_req[i].lens is right after + * others_req[i].offsets. This allows the following recv call to + * receive both offsets and lens in a single call. 
+ */ + if (i == myrank) { + /* send to self uses memcpy(), here + * others_req[i].count == my_req[fd->my_cb_nodes_index].count + */ + memcpy(others_req[i].offsets, + my_req[fd->my_cb_nodes_index].offsets, + my_req[fd->my_cb_nodes_index].count * pair_sz); + } + else { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(others_req[i].offsets, others_req[i].count*pair_sz, + MPI_BYTE, i, 0, fd->comm, &requests[nreqs++]); +#else + MPI_Irecv(others_req[i].offsets, others_req[i].count*pair_sz, + MPI_BYTE, i, 0, fd->comm, &requests[nreqs++]); +#endif + } + } + +#ifdef WKL_DEBUG +/* WRF hangs below when calling MPI_Waitall(), at running 16 nodes, 128 ranks + * per node on Perlmutter, when these 3 env variables are set: + * FI_UNIVERSE_SIZE = 2048 + * FI_CXI_DEFAULT_CQ_SIZE = 524288 + * FI_CXI_RX_MATCH_MODE = software + * + * Using MPI_Alltoallv seems to be able to avoid such hanging problem. (above) + */ +// MPI_Barrier(fd->comm); /* This barrier prevents the MPI_Waitall below from hanging !!! */ +#endif + + for (i=0; ihints->cb_nodes; i++) { + if (my_req[i].count == 0 || i == fd->my_cb_nodes_index) + continue; /* nothing to send or send to self */ + + /* Note the memory address of my_req[i].lens is right after + * my_req[i].offsets. This allows the following Issend call to + * send both offsets and lens in a single call. 
+ */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Issend_c(my_req[i].offsets, my_req[i].count * pair_sz, MPI_BYTE, + fd->hints->ranklist[i], 0, fd->comm, &requests[nreqs++]); +#else + MPI_Issend(my_req[i].offsets, my_req[i].count * pair_sz, MPI_BYTE, + fd->hints->ranklist[i], 0, fd->comm, &requests[nreqs++]); +#endif + } + + if (nreqs) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nreqs, requests, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + MPI_Waitall(nreqs, requests, statuses); + NCI_Free(statuses); +#endif + } + NCI_Free(requests); + } +} + +MPI_Offset PNCIO_LUSTRE_WriteStridedColl(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + /* Uses a generalized version of the extended two-phase method described in + * "An Extended Two-Phase Method for Accessing Sections of Out-of-Core + * Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, + * (5)4:301--317, Winter 1996. + * http://www.mcs.anl.gov/home/thakur/ext2ph.ps + */ + + int i, j, nprocs, myrank; + int do_collect = 1, do_ex_wr; + MPI_Offset start_offset, end_offset; + MPI_Offset min_st_loc = -1, max_end_loc = -1; + MPI_Offset w_len=0; + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset one_len = (MPI_Offset)buf_view.size; +#else + int one_len = (int)buf_view.size; +#endif + +// printf("%s %d: offset=%lld\n",__func__,__LINE__,offset); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +MPI_Barrier(fd->comm); +double curT = MPI_Wtime(); +#endif + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + /* PnetCDF never reuses a fileview across two or more PNCIO calls. As this + * subroutine may modify the contents of fd->flat_file, we save its + * contents and restore it before leaving this subroutine. + */ + PNCIO_View saved_flat_file = fd->flat_file; + + /* fd->flat_file contains a list of starting file offsets and lengths of + * write requests made by this rank.
Similarly, buf_view contains a list of + * offset-length pairs describing the write buffer layout. Note as PnetCDF + * never re-uses a fileview or buffer view. + * + * Note that MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0 Chapter + * 14.1.1) requires that the typemap displacements of etype and filetype + * set by the user are non-negative and monotonically non-decreasing. This + * makes fd->flat_file.off[] to be monotonically non-decreasing. + * + * This rank's aggregate file access region is from start_offset to + * end_offset. Note: end_offset points to the last byte-offset to be + * accessed. E.g., if start_offset=0 and end_offset=99, then the aggregate + * file access region is of size 100 bytes. If this rank has no data to + * write, end_offset == (start_offset - 1) + */ + if (fd->flat_file.count == 0) { /* whole file is visible */ + /* set flat_file as a single contiguous offset-length pair */ + fd->flat_file.off = &offset; + fd->flat_file.len = &one_len; + fd->flat_file.size = one_len; + fd->flat_file.count = 1; + fd->flat_file.is_contig = 1; + start_offset = offset; + end_offset = offset + buf_view.size - 1; + } + else { /* Note flat_file.off[] is always relative to beginning of file */ + /* When flat_file is not contiguous, PnetCDF always calls this + * subroutine with offset == 0. 
+ */ + assert(offset == 0); + start_offset = fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } +// if (myrank==0) printf("%s %d: fd->flat_file size=%lld count=%lld offset=%lld start_offset=%lld end_offset=%lld\n",__func__,__LINE__, fd->flat_file.size, fd->flat_file.count,offset,start_offset,end_offset); + + buf_view.idx = 0; + buf_view.rem = buf_view.size; + if (buf_view.size > 0 && buf_view.count > 1) + buf_view.rem = buf_view.len[0]; + + if (fd->hints->romio_cb_write == PNCIO_HINT_DISABLE) { + /* collective write is explicitly disabled by user */ + do_collect = 0; + } + else { + /* Calculate the aggregate access region of all ranks and check if + * write requests are interleaved among all ranks. + */ + int is_interleaved, large_indv_req = 1; + MPI_Offset striping_range, *st_end_all = NULL; + + /* Gather starting and ending file offsets of write requests from all + * ranks into st_end_all[]. Even indices of st_end_all[] are starting + * offsets, and odd indices are ending offsets. + */ +#if 0 + st_end_all = (MPI_Offset *) NCI_Calloc(nprocs * 2, sizeof(MPI_Offset)); + st_end_all[myrank*2] = start_offset; + st_end_all[myrank*2+1] = end_offset; + MPI_Allreduce(MPI_IN_PLACE, st_end_all, nprocs*2, MPI_OFFSET, MPI_MAX, fd->comm); +#else + MPI_Offset st_end[2]; + st_end[0] = start_offset; + st_end[1] = end_offset; + st_end_all = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset)); + MPI_Allgather(st_end, 2, MPI_OFFSET, st_end_all, 2, MPI_OFFSET, fd->comm); +#endif + + /* The loop below does the followings. + * 1. Calculate this rank's aggregate access region. + * 2. Check whether or not the requests are interleaved among all ranks. + * 3. Check whether there are LARGE individual requests. Here, "large" + * means a write range is > (striping_factor * striping_unit). In + * this case, independent write will perform faster than collective. 
+ */ + striping_range = fd->hints->striping_unit * fd->hints->striping_factor; + is_interleaved = 0; + for (i = 0; i < nprocs * 2; i += 2) { + if (st_end_all[i] > st_end_all[i + 1]) { + /* process rank (i/2) has no data to write */ + continue; + } + min_st_loc = st_end_all[i]; + max_end_loc = st_end_all[i + 1]; + if (st_end_all[i+1] - st_end_all[i] < striping_range) + large_indv_req = 0; + j = i; /* j is the rank of making first non-zero request */ + i += 2; + break; + } + for (; i < nprocs * 2; i += 2) { + if (st_end_all[i] > st_end_all[i + 1]) { + /* process rank (i/2) has no data to write */ + continue; + } + if (st_end_all[i] < st_end_all[j+1]) { + /* start offset of process rank (i/2) is less than the end + * offset of process rank (i/2-1) + */ + is_interleaved = 1; + } + min_st_loc = MIN(st_end_all[i], min_st_loc); + max_end_loc = MAX(st_end_all[i + 1], max_end_loc); + if (st_end_all[i+1] - st_end_all[i] < striping_range) + large_indv_req = 0; + j = i; + } + NCI_Free(st_end_all); + +// if (myrank==0) printf("%s %d: do_collect=%d is_interleaved=%d buf_view size=%lld count=%lld is_contig=%d start_offset=%lld end_offset=%lld\n",__func__,__LINE__, do_collect,is_interleaved,buf_view.size,buf_view.count,buf_view.is_contig, start_offset,end_offset); + if (fd->hints->romio_cb_write == PNCIO_HINT_ENABLE) { + /* explicitly enabled by user */ + do_collect = 1; + } + else if (fd->hints->romio_cb_write == PNCIO_HINT_AUTO) { +// if (myrank==0) printf("%s %d: large_indv_req=%d cb_nodes=%d striping_factor=%d\n",__func__,__LINE__, large_indv_req,fd->hints->cb_nodes , fd->hints->striping_factor); + /* Check if collective write is actually necessary, only when + * romio_cb_write hint is set to PNCIO_HINT_AUTO. + * + * Two typical access patterns can benefit from collective write. + * 1) access file regions of all processes are interleaved, and + * 2) the individual request sizes are not too big, i.e. no + * bigger than striping_range. 
Large individual requests may + * result in a high communication cost in order to + * redistribute requests from non-aggregators to I/O + * aggregators. + */ + if (nprocs == 1) + do_collect = 0; + else if (!is_interleaved && large_indv_req && + fd->hints->cb_nodes <= fd->hints->striping_factor) { + /* do independent write, if every rank's write range > + * striping_range and writes are not interleaved in file + * space + */ + do_collect = 0; + } + } + } + + /* If collective I/O is determined not necessary, use independent I/O */ + if (!do_collect) { + + /* restore flattened file view before leaving this subroutine */ + fd->flat_file = saved_flat_file; + + if (buf_view.size == 0) /* zero-sized request */ + return 0; + + if (fd->flat_file.is_contig && buf_view.is_contig) { + /* Both buffer and fileview are contiguous. Note when + * fd->flat_file.is_contig, it is still possible + * fd->flat_file.count > 0 and when this happens + * fd->flat_file.count should be 1, which comes from PnetCDF wait + * call and the number of nonblocking requests is 1. + */ + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; +#ifdef WKL_DEBUG + printf("%s %d: SWITCH to PNCIO_WriteContig !!!\n",__func__,__LINE__); +#endif + + return PNCIO_WriteContig(fd, buf, buf_view.size, offset); + } + +#ifdef WKL_DEBUG + printf("%s %d: SWITCH to PNCIO_LUSTRE_WriteStrided !!!\n", + __func__,__LINE__); +#endif + + return PNCIO_LUSTRE_WriteStrided(fd, buf, buf_view, offset); + } + + /* Now we are using collective I/O (two-phase I/O strategy) */ + +#ifdef ADJUST_STRIPING_UNIT + /* adjust striping_unit when striping_factor is twice or more than the + * number of compute nodes. Note cb_nodes is set to at least + * striping_factor, if nprocs >= striping_factor. Adjustment below is to + * let each aggregator write to two or more consecutive OSTs, which can + * most likely improve the performance.
This will still yield an effect of + * any one OST receiving write requests from aggregators running on only + * one compute node. + */ + int orig_striping_unit = fd->hints->striping_unit; + + if (fd->hints->striping_factor >= fd->num_nodes * 2) { + fd->hints->striping_unit *= (fd->hints->striping_factor / fd->num_nodes); + + if (fd->hints->cb_buffer_size < fd->hints->striping_unit) { + char value[MPI_MAX_INFO_VAL + 1]; + + fd->hints->cb_buffer_size = fd->hints->striping_unit; + sprintf(value, "%d", fd->hints->cb_buffer_size); + MPI_Info_set(fd->info, "cb_buffer_size", value); + if (fd->is_agg) { + NCI_Free(fd->io_buf); + fd->io_buf = (void*) NCI_Calloc(1, fd->hints->cb_buffer_size); + } + } +#ifdef WKL_DEBUG + if (myrank == 0) + printf("Warning: %s line %d: Change striping_unit from %d to %d\n", + __func__, __LINE__, orig_striping_unit, fd->hints->striping_unit); +#endif + } +#endif + + /* my_req[cb_nodes] is an array of access info, one for each I/O aggregator + * whose file domain has this rank's request. + */ + PNCIO_Access *my_req; + + /* others_req[nprocs] is an array of access info, one for each ranks, both + * aggregators and non-aggregators, whose write requests fall into this + * aggregator's file domain. others_req[] matters only for aggregators. + */ + PNCIO_Access *others_req; + MPI_Offset **buf_idx = NULL; + + if (buf_view.is_contig) + buf_idx = (MPI_Offset **) NCI_Malloc(fd->hints->cb_nodes * + sizeof(MPI_Offset*)); + + /* Calculate the portions of this rank's write requests that fall into the + * file domains of each I/O aggregator. No inter-process communication is + * performed in LUSTRE_Calc_my_req(). + */ + LUSTRE_Calc_my_req(fd, buf_view.is_contig, &my_req, buf_idx); + + if (fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) { + /* When data sieving is considered, below check the current file size + * first. 
If the aggregate access region of this collective write is + * beyond the current file size, then we can safely skip the read of + * the read-modify-write of data sieving. + */ + if (fd->is_agg) { + /* Obtain the current file size. Note an MPI_Allgather() has been + * called above to calculate the aggregate access region. Thus all + * prior independent I/O should have completed by now, so it is + * safe to call lseek() to query the file size. + */ + MPI_Offset cur_off, fsize; + + cur_off = lseek(fd->fd_sys, 0, SEEK_CUR); + fsize = lseek(fd->fd_sys, 0, SEEK_END); + /* Ignore the error, and proceed as if file size is very large. */ +#ifdef PNETCDF_DEBUG + if (fsize == -1) + fprintf(stderr, "%s at %d: lseek SEEK_END failed on file %s (%s)\n", + __func__,__LINE__, fd->filename, strerror(errno)); +#endif + fd->skip_read = (fsize >=0 && min_st_loc >= fsize); + + /* restore file pointer */ + lseek(fd->fd_sys, cur_off, SEEK_SET); + } + } + else + fd->skip_read = 1; + +// if (fd->is_agg && !fd->skip_read) { MPI_Offset fsize = lseek(fd->fd_sys, 0, SEEK_END); printf("%d: %s at %d: skip_read=%d min_st_loc=%lld fsize=%lld\n",myrank,__func__,__LINE__,fd->skip_read,min_st_loc,fsize); } + + /* For aggregators, calculate the portions of all other ranks' requests + * fall into this aggregator's file domain (note only I/O aggregators are + * assigned file domains). + * + * Inter-process communication is required to construct others_req[], + * including MPI_Alltoall, MPI_Issend, MPI_Irecv, and MPI_Waitall. + */ + LUSTRE_Calc_others_req(fd, my_req, &others_req); + + /* Two-phase I/O: first communication phase to exchange write data from all + * ranks to the I/O aggregators, followed by the write phase where only I/O + * aggregators write to the file. + * + * Unless MPI_Alltoallw() is used (when use_alltoallw is set to 1), there + * is no collective MPI communication beyond this point, as + * LUSTRE_Exch_and_write() calls only MPI_Issend, MPI_Irecv, and + * MPI_Waitall. 
Thus it is safe for those non-aggregators making zero-sized + * request to skip the call. + */ + + /* if this rank has data to write, then participate exchange-and-write */ + do_ex_wr = (buf_view.size == 0) ? 0 : 1; + use_alltoallw = 0; + +#ifdef USE_MPI_ALLTOALLW + { + /* When num_nodes < striping_factor, using MPI_Alltoallw in + * commit_comm_phase() is faster than MPI_Issend/MPI_Irecv ... ? + */ + char *env_str; + if ((env_str = getenv("PNETCDF_USE_ALLTOALLW")) != NULL) + use_alltoallw = (strcasecmp(env_str, "true") == 0) ? 1: 0; + } +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[1] += MPI_Wtime() - curT; +#endif + + if (do_ex_wr || fd->is_agg) + /* This rank participates exchange and write only when it has non-zero + * data to write or is an I/O aggregator + */ + w_len = LUSTRE_Exch_and_write(fd, buf, buf_view, others_req, my_req, + min_st_loc, max_end_loc, buf_idx); + + /* free all memory allocated */ + NCI_Free(others_req[0].offsets); + NCI_Free(others_req); + + if (buf_idx != NULL) { + NCI_Free(buf_idx[0]); + NCI_Free(buf_idx); + } + NCI_Free(my_req[0].offsets); + NCI_Free(my_req); + +#ifdef ADJUST_STRIPING_UNIT + /* restore the original striping_unit */ + fd->hints->striping_unit = orig_striping_unit; +#endif + + /* If this collective write is followed by an independent write, it's + * possible to have those subsequent writes on other processes race ahead + * and sneak in before the read-modify-write completes. We carry out a + * collective communication at the end here so no one can start independent + * I/O before collective I/O completes. + * + * need to do some gymnastics with the error codes so that if something + * went wrong, all processes report error, but if a process has a more + * specific error code, we can still have that process report the + * additional information + */ + /* optimization: if only one process performing I/O, we can perform + * a less-expensive Bcast. 
*/ + if (fd->hints->cb_nodes == 1) + MPI_Bcast(&w_len, 1, MPI_OFFSET, fd->hints->ranklist[0], fd->comm); + else + MPI_Allreduce(MPI_IN_PLACE, &w_len, 1, MPI_OFFSET, MPI_MIN, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[0] += MPI_Wtime() - curT; +#endif + + /* restore flattened file view before leaving this subroutine */ + fd->flat_file = saved_flat_file; + + /* w_len may not be the same as buf_view.size, because data sieving may + * write more than requested. + */ + return buf_view.size; +} + +static +void comm_phase_alltoallw(PNCIO_File *fd, + disp_len_list *send_list, /* [cb_nodes] */ + disp_len_list *recv_list) /* [nprocs] */ +{ + /* This subroutine performs the same communication tasks as the below + * commit_comm_phase(), but using MPI_Alltoallw() instead of MPI_Issend and + * MPI_Irecv. + * + * It creates a datatype combining all displacement-length + * pairs in each element of send_list[]. The datatype is used when calling + * MPI_Issend to send write data to the I/O aggregators. Similarly, it + * creates a datatype combining all displacement-length pairs in each + * element of recv_list[] and uses it when calling MPI_Irecv or MPI_Recv + * to receive write data from all processes.
+ */ + int i, nprocs, rank; + size_t alloc_sz; + MPI_Datatype *sendTypes, *recvTypes; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + /* calculate send/recv derived types metadata */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *sendCounts, *recvCounts; + MPI_Aint *sdispls, *rdispls; + alloc_sz = sizeof(MPI_Count) + sizeof(MPI_Aint); + sendCounts = (MPI_Count*) NCI_Calloc(nprocs * 2, alloc_sz); + sdispls = (MPI_Aint*) (sendCounts + (nprocs * 2)); +#else + int *sendCounts, *recvCounts, *sdispls, *rdispls; + alloc_sz = sizeof(int) * 2; + sendCounts = (int*) NCI_Calloc(nprocs * 2, alloc_sz); + sdispls = (int*) (sendCounts + (nprocs * 2)); +#endif + recvCounts = sendCounts + nprocs; + rdispls = sdispls + nprocs; + + /* allocate send/recv derived type arrays */ + sendTypes = (MPI_Datatype*)NCI_Malloc(sizeof(MPI_Datatype) * nprocs * 2); + recvTypes = sendTypes + nprocs; + + for (i=0; iis_agg && recv_list != NULL) { + for (i=0; ihints->cb_nodes; i++) { + /* check if nothing to send or if self */ + if (send_list[i].count == 0 || i == fd->my_cb_nodes_index) continue; + + int dest = fd->hints->ranklist[i]; + sendCounts[dest] = 1; + + /* combine reqs using new datatype */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Type_create_hindexed_c(send_list[i].count, send_list[i].len, + send_list[i].disp, MPI_BYTE, + &sendTypes[dest]); +#else + MPI_Type_create_hindexed(send_list[i].count, send_list[i].len, + send_list[i].disp, MPI_BYTE, + &sendTypes[dest]); +#endif + MPI_Type_commit(&sendTypes[dest]); + } + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Alltoallw_c(MPI_BOTTOM, sendCounts, sdispls, sendTypes, + MPI_BOTTOM, recvCounts, rdispls, recvTypes, fd->comm); +#else + MPI_Alltoallw(MPI_BOTTOM, sendCounts, sdispls, sendTypes, + MPI_BOTTOM, recvCounts, rdispls, recvTypes, fd->comm); +#endif + + for (i=0; ihints->cb_nodes; i++) + send_list[i].count = 0; + + if (recv_list != NULL) + for (i = 0; i < nprocs; i++) + recv_list[i].count = 0; +} + +static +void 
commit_comm_phase(PNCIO_File *fd, + disp_len_list *send_list, /* [cb_nodes] */ + disp_len_list *recv_list) /* [nprocs] */ +{ + /* This subroutine creates a datatype combining all displacement-length + * pairs in each element of send_list[]. The datatype is used when calling + * MPI_Issend to send write data to the I/O aggregators. Similarly, it + * creates a datatype combining all displacement-length pairs in each + * element of recv_list[] and uses it when calling MPI_Irecv or MPI_Recv + * to receive write data from all processes. + */ + int i, nprocs, rank, nreqs; + MPI_Request *reqs; + MPI_Datatype sendType, recvType; +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int j; + double dtype_time=MPI_Wtime(); +#endif + + if (use_alltoallw) + return comm_phase_alltoallw(fd, send_list, recv_list); + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + nreqs = fd->hints->cb_nodes; + nreqs += (fd->is_agg) ? nprocs : 0; + reqs = (MPI_Request *)NCI_Malloc(sizeof(MPI_Request) * nreqs); + nreqs = 0; + + /* receiving part */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + /* recv buffer type profiling */ + int nrecvs=0; + MPI_Offset max_r_amnt=0, max_r_count=0; +#endif + + if (fd->is_agg && recv_list != NULL) { + for (i = 0; i < nprocs; i++) { + /* check if nothing to receive or if self */ + if (recv_list[i].count == 0 || i == rank) continue; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + MPI_Offset r_amnt=0; + for (j=0; jatomicity) { /* Blocking Recv */ + MPI_Status status; + MPI_Recv(MPI_BOTTOM, 1, recvType, i, 0, fd->comm, &status); + } + else + MPI_Irecv(MPI_BOTTOM, 1, recvType, i, 0, fd->comm, + &reqs[nreqs++]); + MPI_Type_free(&recvType); + } + } + + /* send reqs */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + /* send buffer type profiling */ + int nsends=0; + MPI_Offset max_s_amnt=0, max_s_count=0; +#endif + + for (i = 0; i < fd->hints->cb_nodes; i++) { + /* check if nothing to send or 
if self */ + if (send_list[i].count == 0 || i == fd->my_cb_nodes_index) continue; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + MPI_Offset s_amnt=0; + for (j=0; jhints->ranklist[i], 0, + fd->comm, &reqs[nreqs++]); + MPI_Type_free(&sendType); + } + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[4] += MPI_Wtime() - dtype_time; + +/* + fd->write_counter[2] = MAX(fd->write_counter[2], nsends); + fd->write_counter[3] = MAX(fd->write_counter[3], nrecvs); + fd->write_counter[4] = MAX(fd->write_counter[4], max_r_amnt); + fd->write_counter[5] = MAX(fd->write_counter[5], max_s_amnt); + fd->write_counter[6] = MAX(fd->write_counter[6], max_r_count); + fd->write_counter[7] = MAX(fd->write_counter[7], max_s_count); +*/ +#endif + + if (nreqs > 0) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + MPI_Waitall(nreqs, reqs, statuses); + NCI_Free(statuses); +#endif + } + + NCI_Free(reqs); + + /* clear send_list and recv_list for future reuse */ + for (i = 0; i < fd->hints->cb_nodes; i++) + send_list[i].count = 0; + + if (recv_list != NULL) + for (i = 0; i < nprocs; i++) + recv_list[i].count = 0; +} + +/*----< LUSTRE_Exch_and_write() >--------------------------------------------*/ +/* Each process sends all its write requests to I/O aggregators based on the + * file domain assignment to the aggregators. In this implementation, a file is + * first divided into stripes which are assigned to the aggregators in a + * round-robin fashion. The "exchange" of write data from non-aggregators to + * aggregators is carried out in 'ntimes' rounds. Each round covers an + * aggregate file region of size equal to the file stripe size times the number + * of I/O aggregators. The file writes are carried out in every 'nbufs' + * iterations, where 'nbufs' == cb_buffer_size / file stripe size. 
This approach + * is different from ROMIO's implementation as in MPICH 4.2.3. + * + * Other implementations developers are referring to the paper: Wei-keng Liao, + * and Alok Choudhary. "Dynamically Adapting File Domain Partitioning Methods + * for Collective I/O Based on Underlying Parallel File System Locking + * Protocols", in The Supercomputing Conference, 2008. + */ +static +MPI_Offset LUSTRE_Exch_and_write(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + PNCIO_Access *others_req, + PNCIO_Access *my_req, + MPI_Offset min_st_loc, + MPI_Offset max_end_loc, + MPI_Offset **buf_idx) +{ + char **write_buf = NULL, **recv_buf = NULL, **send_buf = NULL; + size_t alloc_sz; + int nprocs, myrank, nbufs, ibuf, batch_idx=0, cb_nodes, striping_unit; + MPI_Count i, j, m, ntimes; + MPI_Count **recv_size=NULL, **recv_count=NULL; + MPI_Count **recv_start_pos=NULL, *send_size; + MPI_Offset end_loc, req_off, iter_end_off, *off_list, step_size; + MPI_Offset *this_buf_idx=NULL; + off_len_list *srt_off_len = NULL; + disp_len_list *send_list = NULL, *recv_list = NULL; + MPI_Offset w_len, total_w_len=0; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + cb_nodes = fd->hints->cb_nodes; + striping_unit = fd->hints->striping_unit; + + /* The aggregate access region (across all processes) of this collective + * write starts from min_st_loc and ends at max_end_loc. The collective + * write is carried out in 'ntimes' rounds of two-phase I/O. Each round + * covers an aggregate file region of size 'step_size' written only by + * cb_nodes number of I/O aggregators. Note non-aggregators must also + * participate all ntimes rounds to send their requests to I/O aggregators. + * + * step_size = the number of I/O aggregators x striping_unit + * + * Note the number of write phases = ntimes / nbufs, as writes (and + * communication) are accumulated for nbufs rounds before flushed. 
+ */ + step_size = (MPI_Offset)cb_nodes * striping_unit; + + /* align min_st_loc downward to the nearest file stripe boundary */ + min_st_loc -= min_st_loc % (MPI_Offset) striping_unit; + + /* ntimes is the number of rounds of two-phase I/O */ + ntimes = (max_end_loc - min_st_loc + 1) / step_size; + if ((max_end_loc - min_st_loc + 1) % step_size) + ntimes++; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_counter[0] = MAX(fd->write_counter[0], ntimes); +#endif + + /* collective buffer is divided into 'nbufs' sub-buffers. Each sub-buffer + * is of size equal to Lustre stripe size. Write data of non-aggregators + * are sent to aggregators and stored in aggregators' sub-buffers, one for + * each round. All nbufs sub-buffers are altogether flushed to file every + * nbufs rounds. + * + * fd->hints->cb_buffer_size, collective buffer size, for Lustre must be at + * least striping_unit. This requirement has been checked at the file + * open/create time when fd->io_buf is allocated. + * + * Note cb_buffer_size and striping_unit may also be adjusted earlier in + * PNCIO_LUSTRE_WriteStridedColl(). + */ + nbufs = fd->hints->cb_buffer_size / striping_unit; + assert(nbufs > 0); /* must at least 1 */ + + /* in case number of rounds is less than nbufs */ + nbufs = (ntimes < nbufs) ? (int)ntimes : nbufs; + + /* off_list[m] is the starting file offset of this aggregator's write + * region in iteration m (file domain of iteration m). This offset + * may not be aligned with file stripe boundaries. + * end_loc is the ending file offset of this aggregator's file domain. 
+ */ + off_list = (MPI_Offset *) NCI_Malloc(ntimes * sizeof(MPI_Offset)); + end_loc = -1; + for (m = 0; m < ntimes; m++) + off_list[m] = max_end_loc; + for (i = 0; i < nprocs; i++) { +// if (myrank == 0) printf("%s at %d: others_req[%d] count=%lld\n",__func__,__LINE__, i,others_req[i].count); + for (j = 0; j < others_req[i].count; j++) { + req_off = others_req[i].offsets[j]; + m = (int) ((req_off - min_st_loc) / step_size); + off_list[m] = MIN(off_list[m], req_off); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); + } + } +// if (myrank == 0) printf("%s at %d: end_loc=%lld nbufs=%d recv_list=%s\n",__func__,__LINE__, end_loc,nbufs,(recv_list==NULL)?"NULL":"NOT NULL"); + + /* Allocate displacement-length pair arrays, describing the send buffer. + * send_list[i].count: number displacement-length pairs. + * send_list[i].len: length in bytes. + * send_list[i].disp: displacement (send buffer address). + */ + send_list = (disp_len_list*) NCI_Malloc(sizeof(disp_len_list) * cb_nodes); + for (i = 0; i < cb_nodes; i++) { + send_list[i].count = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Count) * 2; + send_list[i].disp = (MPI_Count*) NCI_Malloc(alloc_sz * nbufs); + send_list[i].len = send_list[i].disp + nbufs; +#else + alloc_sz = sizeof(MPI_Aint) + sizeof(int); + send_list[i].disp = (MPI_Aint*) NCI_Malloc(alloc_sz * nbufs); + send_list[i].len = (int*) (send_list[i].disp + nbufs); +#endif + } + + /* end_loc >= 0 indicates this process has something to write to the file. + * Only I/O aggregators can have end_loc > 0. write_buf is the collective + * buffer and only matter for I/O aggregators. recv_buf is the buffer used + * only by aggregators to receive requests from non-aggregators. Its size + * may be larger then the file stripe size, in case when writes from + * non-aggregators overlap. In this case, it will be realloc-ed in + * LUSTRE_W_Exchange_data(). 
The received data is later copied over to + * write_buf, whose contents will be written to file. + */ + if (end_loc >= 0 && nbufs > 0) { + /* Allocate displacement-length pair arrays, describing the recv buffer. + * recv_list[i].count: number displacement-length pairs. + * recv_list[i].len: length in bytes. + * recv_list[i].disp: displacement (recv buffer address). + */ + assert(fd->is_agg); + + recv_list = (disp_len_list*) NCI_Malloc(sizeof(disp_len_list) * nprocs); + for (i = 0; i < nprocs; i++) { + recv_list[i].count = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Count) * 2; + recv_list[i].disp = (MPI_Count*) NCI_Malloc(alloc_sz * nbufs); + recv_list[i].len = recv_list[i].disp + nbufs; +#else + alloc_sz = sizeof(MPI_Aint) + sizeof(int); + recv_list[i].disp = (MPI_Aint*) NCI_Malloc(alloc_sz * nbufs); + recv_list[i].len = (int*) (recv_list[i].disp + nbufs); +#endif + } + + /* collective buffer was allocated at file open/create. For Lustre, its + * size must be at least striping_unit, which has been checked at the + * time fd->io_buf is allocated. 
+ */ + assert(fd->io_buf != NULL); + + /* divide collective buffer into nbufs sub-buffers */ + write_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + write_buf[0] = fd->io_buf; + + /* Similarly, receive buffer consists of nbufs sub-buffers */ + recv_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + recv_buf[0] = (char *) NCI_Malloc(striping_unit); + + /* recv_count[j][i] is the number of off-len pairs to be received from + * each proc i in round j + */ + recv_count = (MPI_Count**) NCI_Malloc(3 * nbufs * sizeof(MPI_Count*)); + recv_count[0] = (MPI_Count*) NCI_Malloc(3 * nbufs * nprocs * sizeof(MPI_Count)); + + /* recv_size[j][i] is the receive size from proc i in round j */ + recv_size = recv_count + nbufs; + recv_size[0] = recv_count[0] + nbufs * nprocs; + + /* recv_start_pos[j][i] is the starting index of offset-length arrays + * pointed by others_req[i].curr for remote rank i in round j + */ + recv_start_pos = recv_size + nbufs; + recv_start_pos[0] = recv_size[0] + nbufs * nprocs; + + for (j = 1; j < nbufs; j++) { + write_buf[j] = write_buf[j-1] + striping_unit; + /* recv_buf[j] may be realloc in LUSTRE_W_Exchange_data() */ + recv_buf[j] = (char *) NCI_Malloc(striping_unit); + recv_count[j] = recv_count[j-1] + nprocs; + recv_size[j] = recv_size[j-1] + nprocs; + recv_start_pos[j] = recv_start_pos[j-1] + nprocs; + } + + /* srt_off_len consists of file offset-length pairs sorted in a + * monotonically non-decreasing order (required by MPI-IO standard) + * which is used when writing to the file + */ + srt_off_len = (off_len_list*) NCI_Malloc(nbufs * sizeof(off_len_list)); + } + + /* send_buf[] will be allocated in LUSTRE_W_Exchange_data(), when the use + * buffer is not contiguous. + */ + send_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + + /* this_buf_idx contains indices to the user write buffer for sending this + * rank's write data to aggregators, one for each aggregator. It is used + * only when user buffer is contiguous. 
+ */ + if (buf_view.is_contig) + this_buf_idx = (MPI_Offset *) NCI_Malloc(sizeof(MPI_Offset) * cb_nodes); + + /* array of data sizes to be sent to each aggregator in a 2-phase round */ + send_size = (MPI_Count *) NCI_Calloc(cb_nodes, sizeof(MPI_Count)); + + /* min_st_loc is the beginning file offsets of the aggregate access region + * of this collective write, and it has been downward aligned to the + * nearest file stripe boundary + * iter_end_off is the ending file offset of aggregate write region of + * iteration m, upward aligned to the file stripe boundary. + */ + iter_end_off = min_st_loc + step_size; + + ibuf = 0; + for (m = 0; m < ntimes; m++) { + MPI_Count range_size; + MPI_Offset range_off; + + /* Note that MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0 Chapter + * 14.1.1) requires that the typemap displacements of etype and + * filetype are non-negative and monotonically non-decreasing. This + * simplifies implementation a bit compared to reads. + */ + + /* Calculate what should be communicated. + * + * First, calculate the amount to be sent to each aggregator i, at this + * round m, by going through all offset-length pairs in my_req[i]. + * + * iter_end_off - ending file offset of aggregate write region of this + * round, and upward aligned to the file stripe + * boundary. Note the aggregate write region of this + * round starts from (iter_end_off-step_size) to + * iter_end_off, aligned with file stripe boundaries. + * send_size[i] - total size in bytes of this process's write data + * fall into aggregator i's FD in this round. + * recv_size[m][i] - size in bytes of data to be received by this + * aggregator from process i in round m. + * recv_count[m][i] - number of noncontiguous offset-length pairs from + * process i fall into this aggregator's write region + * in round m. 
+ */ + for (i = 0; i < cb_nodes; i++) { + /* reset communication metadata to all 0s for this round */ + send_size[i] = 0; + + if (my_req[i].count == 0) continue; + /* my_req[i].count is the number of this rank's offset-length pairs + * to be sent to aggregator i + */ + + if (my_req[i].curr == my_req[i].count) + continue; /* done with aggregator i */ + + if (buf_view.is_contig) + /* buf_idx is used only when user buffer is contiguous. + * this_buf_idx[i] points to the starting offset of user + * buffer, buf, for amount of send_size[i] to be sent to + * aggregator i at this round. + */ + this_buf_idx[i] = buf_idx[i][my_req[i].curr]; + + /* calculate the send amount from this rank to aggregator i */ + for (j = my_req[i].curr; j < my_req[i].count; j++) { + if (my_req[i].offsets[j] < iter_end_off) + send_size[i] += my_req[i].lens[j]; + else + break; + } + + /* update my_req[i].curr to point to the jth offset-length + * pair of my_req[i], which will be used as the first pair in the + * next round of iteration. + */ + my_req[i].curr = j; + } + + /* range_off is the starting file offset of this aggregator's write + * region at this round (may not be aligned to stripe boundary). + * range_size is the size (in bytes) of this aggregator's write region + * for this round (whose size is always <= striping_unit). + */ + range_off = off_list[m]; + range_size = MIN(striping_unit - range_off % striping_unit, + end_loc - range_off + 1); + + /* Calculate the amount to be received from each process i at this + * round, by going through all offset-length pairs of others_req[i]. 
+ */ + if (recv_count != NULL) { + for (i=0; iis_agg) fd->write_timing[3] += MPI_Wtime() - curT; +#endif + + /* free send_buf allocated in LUSTRE_W_Exchange_data() */ + for (j = 0; j < numBufs; j++) { + if (send_buf[j] != NULL) { + NCI_Free(send_buf[j]); + send_buf[j] = NULL; + } + } + if (!fd->is_agg) /* non-aggregators are done for this batch */ + continue; + + if (recv_list == NULL) /* this aggregator has nothing to write */ + continue; + + /* this aggregator unpacks the data in recv_buf[] into write_buf */ + if (end_loc >= 0) { + for (j = 0; j < numBufs; j++) { + char *buf_ptr = recv_buf[j]; + for (i = 0; i < nprocs; i++) { + if (recv_count[j][i] > 1 && i != myrank) { + /* When recv_count[j][i] == 1, this case has + * been taken care of earlier by receiving the + * message directly into write_buf. + */ + MEMCPY_UNPACK(i, buf_ptr, recv_start_pos[j][i], + recv_count[j][i], write_buf[j]); + buf_ptr += recv_size[j][i]; + } + } + } + } + + /* this aggregator writes to numBufs number of stripes */ + for (j=0; j 1, + * data sieving is not performed and holes have been found. In + * this case, srt_off_len[] is the list of sorted offset-length + * pairs describing noncontiguous writes. Now call writes for + * each offset-length pair. Note the offset-length pairs + * (represented by srt_off_len[j].off, srt_off_len[j].len, and + * srt_off_len[j].num) have been coalesced in + * LUSTRE_W_Exchange_data(). + */ +// printf("%s at %d: num=%d\n",__func__,__LINE__, srt_off_len[j].num); + for (i = 0; i < srt_off_len[j].num; i++) { + /* all write requests in this round should fall into file + * range of [range_off, range_off+range_size). This below + * assertion should never fail. 
+ */ + assert(srt_off_len[j].off[i] < range_off + range_size && + srt_off_len[j].off[i] >= range_off); + +// printf("%s at %d: PNCIO_WriteContig num=%d [%d] off=%lld len=%lld\n",__func__,__LINE__, srt_off_len[j].num,i,srt_off_len[j].off[i],srt_off_len[j].len[i]); + w_len = PNCIO_WriteContig(fd, + write_buf[j] + (srt_off_len[j].off[i] - range_off), + srt_off_len[j].len[i], + srt_off_len[j].off[i]); + if (w_len < 0) goto over; + total_w_len += w_len; + } + if (srt_off_len[j].num > 0) { + NCI_Free(srt_off_len[j].off); + srt_off_len[j].num = 0; + } + } + batch_idx += numBufs; /* only matters for aggregators */ + } + } + + over: + if (srt_off_len) + NCI_Free(srt_off_len); + if (write_buf != NULL) + NCI_Free(write_buf); + if (recv_buf != NULL) { + for (j = 0; j < nbufs; j++) + NCI_Free(recv_buf[j]); + NCI_Free(recv_buf); + } + if (recv_count != NULL) { + NCI_Free(recv_count[0]); + NCI_Free(recv_count); + } + NCI_Free(send_size); + NCI_Free(off_list); + if (buf_view.is_contig) + NCI_Free(this_buf_idx); + if (send_buf != NULL) + NCI_Free(send_buf); + if (send_list != NULL) { + for (i = 0; i < cb_nodes; i++) + NCI_Free(send_list[i].disp); + NCI_Free(send_list); + } + if (recv_list != NULL) { + for (i = 0; i < nprocs; i++) + NCI_Free(recv_list[i].disp); + NCI_Free(recv_list); + } + +#ifdef WKL_DEBUG + /* check any pending messages to be received */ + MPI_Status probe_st; + int probe_flag; + MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, fd->comm, &probe_flag, &probe_st); + if (probe_flag) { + printf("ERROR ++++ MPI_Iprobe rank=%4d is_agg=%d: ---- cb_nodes=%d ntimes=%lld nbufs=%d\n",myrank,fd->is_agg,cb_nodes,ntimes,nbufs); + fflush(stdout); + } +#endif + return total_w_len; +} + +/* This heap-merge sort also coalesces sorted offset-length pairs whenever + * possible. + * + * Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 modified for a + * heap with smallest element at root. The recursion has been removed so that + * there are no function calls. 
Function calls are too expensive. + */ +static +void heap_merge(const PNCIO_Access *others_req, + const MPI_Count *count, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *srt_off, + MPI_Count *srt_len, +#else + MPI_Offset *srt_off, + int *srt_len, +#endif + const MPI_Count *start_pos, + int nprocs, + int nprocs_recv, + MPI_Count *total_elements) +{ + typedef struct { + MPI_Offset *off_list; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_list; +#else + int *len_list; +#endif + MPI_Count nelem; + } heap_struct; + + heap_struct *a, tmp; + int i, j, heapsize, l, r, k, smallest; + + a = (heap_struct *) NCI_Malloc((nprocs_recv + 1) * sizeof(heap_struct)); + + j = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + a[j].off_list = others_req[i].offsets + start_pos[i]; + a[j].len_list = others_req[i].lens + start_pos[i]; + a[j].nelem = count[i]; + j++; + } + } + +#define SWAP(x, y, tmp) { tmp = x ; x = y ; y = tmp ; } + + heapsize = nprocs_recv; + + /* Build a heap out of the first element from each list, with the smallest + * element of the heap at the root. The first for loop is to find and move + * the smallest a[*].off_list[0] to a[0]. + */ + for (i = heapsize / 2 - 1; i >= 0; i--) { + k = i; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + SWAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; + } + } + + /* The heap keeps the smallest element in its first element, i.e. + * a[0].off_list[0]. + */ + j = 0; + for (i = 0; i < *total_elements; i++) { + /* extract smallest element from heap, i.e. 
the root */ + if (j == 0 || srt_off[j - 1] + srt_len[j - 1] < *(a[0].off_list)) { + srt_off[j] = *(a[0].off_list); + srt_len[j] = *(a[0].len_list); + j++; + } else { + /* this offset-length pair can be coalesced into the previous one */ + srt_len[j - 1] = *(a[0].off_list) + *(a[0].len_list) - srt_off[j - 1]; + } + (a[0].nelem)--; + + if (a[0].nelem) { + (a[0].off_list)++; + (a[0].len_list)++; + } else { + a[0] = a[heapsize - 1]; + heapsize--; + } + + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + SWAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; + } + } + NCI_Free(a); + *total_elements = j; +} + +#define CACHE_REQ(list, nelems, buf) { \ + MPI_Aint buf_addr; \ + list.len[list.count] = nelems; \ + MPI_Get_address(buf, &buf_addr); \ + list.disp[list.count] = buf_addr; \ + list.count++; \ +} + +static +int Exchange_data_recv( + PNCIO_File *fd, + const void *buf, /* user buffer */ + char *write_buf, /* OUT: internal buffer used to write + * to file */ + char **recv_buf, /* OUT: [nbufs] internal buffer used to + * receive from other processes */ + const PNCIO_View *buf_view, /* IN: flattened buffer + * offset-length pairs */ + const MPI_Count *recv_size, /* [nprocs] recv_size[i] is amount of + * this aggregator recv from rank i */ + MPI_Offset range_off, /* starting file offset of this + * aggregator's write region */ + MPI_Count range_size, /* amount of this aggregator's write + * region */ + const MPI_Count *recv_count, /* [nprocs] recv_count[i] is the number + * of offset-length pairs received from + * rank i */ + const MPI_Count *start_pos, /* [nprocs] start_pos[i] starting value + * of others_req[i].curr */ + const PNCIO_Access *others_req, /* [nprocs] others_req[i] is rank i's + * write requests fall 
into this + * aggregator's file domain */ + const MPI_Offset *buf_idx, /* [cb_nodes] indices to user buffer + * offsets for sending this rank's + * write data to aggregator i */ + off_len_list *srt_off_len, /* OUT: list of write offset-length + * pairs of this aggregator */ + disp_len_list *recv_list) /* OUT: displacement-length pairs of + * recv buffer */ +{ + char *buf_ptr, *contig_buf; + size_t alloc_sz; + int i, j, nprocs, myrank, nprocs_recv, hole, build_srt_off_len; + MPI_Count sum_recv; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + /* srt_off_len contains the file offset-length pairs to be written by this + * aggregator at this round. The file region starts from range_off with + * size of range_size. + */ + + srt_off_len->num = 0; + srt_off_len->off = NULL; + sum_recv = 0; + nprocs_recv = 0; + + /* calculate receive metadata */ + j = -1; + for (i = 0; i < nprocs; i++) { + srt_off_len->num += recv_count[i]; + if (j == -1 && recv_count[i] > 0) j = i; + sum_recv += recv_size[i]; + if (recv_size[i]) + nprocs_recv++; + } + + if (nprocs_recv == 0) return NC_NOERR; + +// MPI_Count numx = srt_off_len->num; printf("nprocs_recv=%d PNCIO_DS_WR_NAGGRS_LB=%d srt_off_len->num=%lld PNCIO_DS_WR_NPAIRS_LB=%d\n",nprocs_recv,PNCIO_DS_WR_NAGGRS_LB,srt_off_len->num,PNCIO_DS_WR_NPAIRS_LB); + + /* determine whether checking holes is necessary */ + if (srt_off_len->num == 0) { + /* this process has nothing to receive and hence no hole */ + build_srt_off_len = 0; + hole = 0; + } else if (srt_off_len->num == 1) { + build_srt_off_len = 0; + hole = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + 1); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (int*) (srt_off_len->off + 1); +#endif + srt_off_len->off[0] = 
others_req[j].offsets[start_pos[j]]; + srt_off_len->len[0] = others_req[j].lens[start_pos[j]]; + } else if (fd->hints->romio_ds_write == PNCIO_HINT_ENABLE) { + /* skip building of srt_off_len and proceed to read-modify-write */ + build_srt_off_len = 0; + /* assuming there are holes */ + hole = 1; + } else if (fd->hints->romio_ds_write == PNCIO_HINT_AUTO) { + if (DO_HEAP_MERGE(nprocs_recv, srt_off_len->num)) { + /* When the number of sorted offset-length lists or the total + * number of offset-length pairs are too large, the heap-merge sort + * below for building srt_off_len can become very expensive. Such + * sorting is also used to check holes to determine whether + * read-modify-write is necessary. + */ + build_srt_off_len = 0; + /* assuming there are holes */ + hole = 1; + } + else /* heap-merge is less expensive, proceed to build srt_off_len */ + build_srt_off_len = 1; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (build_srt_off_len) { + fd->write_counter[1]++; + fd->write_counter[2] = MAX(fd->write_counter[2], srt_off_len->num); + fd->write_counter[3] = MAX(fd->write_counter[3], nprocs_recv); + } else { + fd->write_counter[4]++; + fd->write_counter[5] = MAX(fd->write_counter[5], srt_off_len->num); + fd->write_counter[6] = MAX(fd->write_counter[6], nprocs_recv); + } +#endif + } else { /* if (fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) */ + /* User explicitly disable data sieving to skip read-modify-write. + * Whether or not there is a hole is not important. However, + * srt_off_len must be constructed to merge all others_req[] into a + * single sorted list. This step is necessary because after this + * subroutine returns, write data from all non-aggregators will be + * packed into the write_buf, with a possibility of overlaps, and + * as srt_off_len stores the coalesced offset-length pairs of + * individual non-contiguous write requests, it is used to write them + * to the file. 
+ */ + build_srt_off_len = 1; + } + + if (build_srt_off_len) { + /* merge all the offset-length pairs from others_req[] (already sorted + * individually) into a single list of offset-length pairs. + */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz * srt_off_len->num); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + srt_off_len->num); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz * srt_off_len->num); + srt_off_len->len = (int*) (srt_off_len->off + srt_off_len->num); +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double curT = MPI_Wtime(); +#endif + heap_merge(others_req, recv_count, srt_off_len->off, srt_off_len->len, + start_pos, nprocs, nprocs_recv, &srt_off_len->num); + + /* Now, (srt_off_len->off and srt_off_len->len) are in an increasing + * order of file offsets. In addition, they are coalesced. + */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[5] += MPI_Wtime() - curT; +#endif + /* whether or not there are holes */ + hole = (srt_off_len->num > 1); + } + +// printf("%s at %d: romio_ds_write=%s build_srt_off_len=%d hole=%d skip_read=%d srt_off_len->num=%lld\n",__func__,__LINE__, (fd->hints->romio_ds_write == PNCIO_HINT_ENABLE)?"ENABLE": (fd->hints->romio_ds_write == PNCIO_HINT_DISABLE)?"DISABLE":"AUTO", build_srt_off_len,hole,fd->skip_read,srt_off_len->num); +// printf("%s at %d: romio_ds_write=%s build_srt_off_len=%d hole=%d nprocs_recv=%d(PNCIO_DS_WR_NAGGRS_LB=%d) numx=%lld(PNCIO_DS_WR_NPAIRS_LB=%d)\n",__func__,__LINE__, (fd->hints->romio_ds_write == PNCIO_HINT_ENABLE)?"ENABLE": (fd->hints->romio_ds_write == PNCIO_HINT_DISABLE)?"DISABLE":"AUTO", build_srt_off_len,hole,nprocs_recv,PNCIO_DS_WR_NAGGRS_LB,numx,PNCIO_DS_WR_NPAIRS_LB); + + /* data sieving */ + if (fd->hints->romio_ds_write != PNCIO_HINT_DISABLE && hole) { + if (fd->skip_read) + 
memset(write_buf, 0, range_size); + else { + MPI_Offset r_len; + r_len = PNCIO_ReadContig(fd, write_buf, range_size, range_off); + if (r_len < 0) return (int)r_len; + } + + /* Once read, holes have been filled and thus the number of + * offset-length pairs, srt_off_len->num, becomes one. + */ + srt_off_len->num = 1; + if (srt_off_len->off == NULL) { /* if has not been malloc-ed yet */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + 1); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (int*) (srt_off_len->off + 1); +#endif + } + srt_off_len->off[0] = range_off; + srt_off_len->len[0] = range_size; + } + + /* It is possible sum_recv (sum of message sizes to be received) is larger + * than the size of collective buffer, write_buf, if writes from multiple + * remote processes overlap. Receiving messages into overlapped regions of + * the same write_buffer may cause a problem. To avoid it, we allocate a + * temporary buffer big enough to receive all messages into disjointed + * regions. Earlier in LUSTRE_Exch_and_write(), write_buf is already + * allocated with twice amount of the file stripe size, with the second + * half to be used to receive messages. If sum_recv is smaller than file + * stripe size, we can reuse that space. But if sum_recv is bigger (an + * overlap case, which is rare), we allocate a separate buffer of size + * sum_recv. 
+ */ + sum_recv -= recv_size[myrank]; + if (sum_recv > fd->hints->striping_unit) + *recv_buf = (char *) NCI_Realloc(*recv_buf, sum_recv); + contig_buf = *recv_buf; + + /* cache displacement-length pairs of receive buffer */ + buf_ptr = contig_buf; + for (i = 0; i < nprocs; i++) { + if (recv_size[i] == 0) + continue; + if (i != myrank) { + if (recv_count[i] > 1) { + CACHE_REQ(recv_list[i], recv_size[i], buf_ptr) + buf_ptr += recv_size[i]; + } else { + /* recv_count[i] is the number of noncontiguous offset-length + * pairs describing the write requests of rank i that fall + * into this aggregator's file domain. When recv_count[i] is 1, + * there is only one such pair, meaning the receive message is + * to be stored contiguously. Such message can be received + * directly into write_buf. + */ + CACHE_REQ(recv_list[i], recv_size[i], + write_buf + others_req[i].mem_ptrs[start_pos[i]]) + } + } else if (buf_view->is_contig && recv_count[i] > 0) { + /* send/recv to/from self uses memcpy(). The case when buftype is + * not contiguous will be handled later in Exchange_data_send(). + */ + char *fromBuf = (char *) buf + buf_idx[fd->my_cb_nodes_index]; + MEMCPY_UNPACK(i, fromBuf, start_pos[i], recv_count[i], write_buf); + } + } + return NC_NOERR; +} + +static +void Exchange_data_send( + PNCIO_File *fd, + const void *buf, /* user buffer */ + char *write_buf, /* OUT: internal buffer used to write + * to file, only matter when send to + * self */ + char **send_buf_ptr, /* OUT: [cb_nodes] point to internal + * send buffer */ + PNCIO_View *buf_view, /* IN/OUT: flattened buffer + * offset-length pairs */ + const MPI_Count *send_size, /* [cb_nodes] send_size[i] is amount of + * this rank sent to aggregator i */ + MPI_Count self_count, /* No. 
offset-length pairs sent to self + * rank */ + MPI_Count start_pos, /* others_req[myrank].curr */ + const PNCIO_Access *others_req, /* [nprocs] only used when send to self, + * others_req[myrank] */ + const MPI_Offset *buf_idx, /* [cb_nodes] indices to user buffer + * for sending this rank's write data + * to aggregator i */ + disp_len_list *send_list) /* OUT: displacement-length pairs of + * send buffer */ +{ + int i, myrank, cb_nodes; + + *send_buf_ptr = NULL; + + MPI_Comm_rank(fd->comm, &myrank); + + cb_nodes = fd->hints->cb_nodes; +// if (myrank==0) printf("%s at %d: cb_nodes=%d\n",__func__,__LINE__, cb_nodes); + if (buf_view->is_contig) { + /* If buftype is contiguous, data can be directly sent from user buf + * at location given by buf_idx. + */ + for (i = 0; i < cb_nodes; i++) { +// if (myrank==0 && send_size[i]) printf("%s at %d: cb_nodes=%d send_size[%d]=%lld my_cb_nodes_index=%d\n",__func__,__LINE__, cb_nodes,i,send_size[i],fd->my_cb_nodes_index); + if (send_size[i] && i != fd->my_cb_nodes_index) + CACHE_REQ(send_list[i], send_size[i], (char*)buf + buf_idx[i]); + } + } else { + char **send_buf, *self_buf; + + /* total send size of this round */ + size_t send_total_size = 0; + for (i = 0; i < cb_nodes; i++) + send_total_size += send_size[i]; + + if (send_total_size == 0) return; + + /* The user buffer to be used to send in this round is not contiguous, + * allocate send_buf[], a contiguous space, copy data to send_buf, + * including ones to be sent to self, and then use send_buf to send. 
+ */ + send_buf = (char **) NCI_Malloc(cb_nodes * sizeof(char *)); + send_buf[0] = (char *) NCI_Malloc(send_total_size); + for (i = 1; i < cb_nodes; i++) + send_buf[i] = send_buf[i - 1] + send_size[i - 1]; + + LUSTRE_Fill_send_buffer(fd, buf, buf_view, send_buf, + send_total_size, send_size, &self_buf, + send_list); + /* Send buffers must not be touched before MPI_Waitall() is completed, + * and thus send_buf will be freed in LUSTRE_Exch_and_write() + */ + + if (fd->my_cb_nodes_index >= 0 && send_size[fd->my_cb_nodes_index] > 0) { + /* contents of user buf that must be sent to self has been copied + * into send_buf[fd->my_cb_nodes_index]. Now unpack it into + * write_buf. + */ + if (self_buf == NULL) self_buf = send_buf[fd->my_cb_nodes_index]; + MEMCPY_UNPACK(myrank, self_buf, start_pos, self_count, write_buf); + } + + *send_buf_ptr = send_buf[0]; + NCI_Free(send_buf); + } +} + +static void LUSTRE_Fill_send_buffer(PNCIO_File *fd, + const void *buf, + PNCIO_View *buf_view, /* IN/OUT */ + char **send_buf, + size_t send_total_size, + const MPI_Count *send_size, + char **self_buf, + disp_len_list *send_list) +{ + /* this function is only called if buftype is not contiguous */ + int q, first_q=-1, isUserBuf=0; + MPI_Count send_size_rem=0, size, copy_size=0; + char *user_buf_ptr=NULL, *send_buf_ptr=NULL, *same_buf_ptr=NULL; + MPI_Offset off, user_buf_idx; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset len, rem_len; +#else + int len, rem_len; +#endif + +#ifdef WKL_DEBUG +int num_memcpy=0; +#endif + + *self_buf = NULL; + + /* user_buf_idx is to the index offset to buf, indicating the starting + * location to be copied. + * + * buf_view stores the offset-length pairs of the flattened user buffer + * data type. Note this stores offset-length pairs of the data type, + * and write amount can be a multiple of the data type. + * buf_view.count: the number of pairs + * buf_view.off[i]: the ith pair's byte offset to buf. 
Note the + * flattened offsets of user buffer type may not be sorted in an + * increasing order, unlike fileview which is required by MPI to be + * sorted in a monotonically non-decreasing order. + * buf_view.len[i]: length of the ith pair + * buf_view.idx: index to the offset-length pair currently being + * processed, incremented each round. + * buf_view.rem: amount of data in the pair that has not been copied + * over, changed each round. + */ + user_buf_idx = buf_view->off[buf_view->idx] + + buf_view->len[buf_view->idx] + - buf_view->rem; + /* in case data left to be copied from previous round */ + + /* fd->flat_file.count has been checked and adjusted to a possitive number + * at the beginning of PNCIO_LUSTRE_WriteStridedColl(). + */ + assert(fd->flat_file.count > 0); + + /* fd->flat_file.count: the number of noncontiguous file segments this + * rank writes to. Each segment i is described by fd->flat_file.offs[i] + * and fd->flat_file.len[i]. + * fd->flat_file.idx: the index to the fd->flat_file.offs[], + * fd->flat_file.len[] that have been processed in the previous round. + * The while loop below packs write data into send buffers, send_buf[], + * based on this rank's off-len pairs in its file view, + */ + off = fd->flat_file.off[fd->flat_file.idx] + + fd->flat_file.len[fd->flat_file.idx] + - fd->flat_file.rem; + rem_len = fd->flat_file.rem; + +// int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); + while (send_total_size > 0) { + /* this off-len request may span to more than one I/O aggregator */ +// if (rank == 0) printf("rank 0 %s at %d send_total_size=%zd rem_len=%lld\n",__func__,__LINE__,send_total_size,rem_len); + while (rem_len != 0) { + len = rem_len; + q = LUSTRE_Calc_aggregator(fd, off, &len); + /* NOTE: len will be modified by PNCIO_Calc_aggregator() to be no + * more than a file stripe unit size that aggregator "q" is + * responsible for. Note q is not the MPI rank ID, It is the array + * index to fd->hints->ranklist[]. 
+ * + * Now len is the amount of data in ith off-len pair that should be + * sent to aggregator q. Note q can also be self. In this case, + * data is also packed into send_buf[q] or pointed to a segment of + * buf when the data to be packed is contiguous. send_buf[q] will + * later be copied to write buffer in MEMCPY_UNPACK, instead of + * calling MPI_Issend to send. + * + * send_size[q]: data amount of this rank needs to send to + * aggregator q in this round. + * + * len and send_size[q] are all always <= striping_unit + */ + +// if (rank == 0) printf("rank 0 %s at %d rem_len=%lld len=%lld first_q=%d q=%d idx=%lld\n",__func__,__LINE__,rem_len,len,first_q,q,buf_view->idx); + + if (first_q != q) { + assert(send_size_rem == 0); + first_q = q; + isUserBuf = 1; + send_size_rem = send_size[q]; + copy_size = 0; + same_buf_ptr = (char*)buf + user_buf_idx; /* no increment */ + user_buf_ptr = same_buf_ptr; /* increment after each memcpy */ + if (send_buf != NULL) + send_buf_ptr = send_buf[q]; /* increment after each memcpy */ + } + + /* copy len amount of data from buf to send_buf[q] */ + size = len; + + while (size) { + MPI_Count size_in_buf = MIN(size, buf_view->rem); + copy_size += size_in_buf; + user_buf_idx += size_in_buf; + send_size_rem -= size_in_buf; + buf_view->rem -= size_in_buf; +// if (rank == 0) printf("rank 0 %s at %d size=%lld size_in_buf=%lld copy_size=%lld rem=%ld\n",__func__,__LINE__, size, size_in_buf, copy_size,buf_view->rem); + if (buf_view->rem == 0) { /* move on to next off-len pair */ + if (! 
buf_view->is_contig) { + /* user buffer type is not contiguous */ + if (send_size_rem) { + /* after this copy send_buf[q] is still not full */ + isUserBuf = 0; +// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 1 size=%lld user_buf_ptr=%p\n",copy_size,user_buf_ptr); + memcpy(send_buf_ptr, user_buf_ptr, copy_size); +user_buf_ptr += copy_size; + send_buf_ptr += copy_size; + copy_size = 0; + } else if (isUserBuf == 0) { + /* send_buf[q] is full and not using user buf, + * copy the remaining delayed data */ +// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 2 size=%lld\n",copy_size); + memcpy(send_buf_ptr, user_buf_ptr, copy_size); +user_buf_ptr += copy_size; + } +#ifdef WKL_DEBUG +num_memcpy++; +#endif + } + /* update buf_view->idx, buf_view->rem, + * and user_buf_idx + */ + buf_view->idx++; +assert(buf_view->idx <= buf_view->count); + +if (buf_view->idx < buf_view->count) { + user_buf_idx = buf_view->off[buf_view->idx]; + buf_view->rem = buf_view->len[buf_view->idx]; + user_buf_ptr = (char*) buf + user_buf_idx; +} +else assert(size - size_in_buf == 0); + + } + else if (send_size_rem == 0 && isUserBuf == 0) { + /* buf_view->rem > 0, send_buf[q] is full, and not using + * user buf to send, copy the remaining delayed data + */ +// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 3 size=%lld\n",copy_size); + memcpy(send_buf_ptr, user_buf_ptr, copy_size); +#ifdef WKL_DEBUG +num_memcpy++; +#endif + user_buf_ptr += copy_size; + } + size -= size_in_buf; + } + + if (send_size_rem == 0) { /* data to q is fully packed */ + first_q = -1; + + if (q != fd->my_cb_nodes_index) { /* send only if not self rank */ + if (isUserBuf) + CACHE_REQ(send_list[q], send_size[q], same_buf_ptr) + else + CACHE_REQ(send_list[q], send_size[q], send_buf[q]) + } + else if (isUserBuf) { + /* send buffer is also (part of) user's buf. 
Return the + * buffer pointer, so the self send data can be directly + * unpack from user buf to write buffer. + */ + *self_buf = same_buf_ptr; + } + } + /* len is the amount of data copied */ + off += len; + rem_len -= len; + fd->flat_file.rem -= len; + send_total_size -= len; + if (send_total_size == 0) break; + } + if (send_total_size == 0) break; + + /* done with this off-len pair, move on to the next */ + if (fd->flat_file.rem == 0) { + fd->flat_file.idx++; + fd->flat_file.rem = fd->flat_file.len[fd->flat_file.idx]; + } + off = fd->flat_file.off[fd->flat_file.idx]; + rem_len = fd->flat_file.rem; + } + +#ifdef WKL_DEBUG +if (num_memcpy> 0) printf("---- fd->flat_file.count=%lld fd->flat_file.idx=%lld buf_view->count=%lld num_memcpy=%d\n",fd->flat_file.count,fd->flat_file.idx,buf_view->count,num_memcpy); +#endif +} + diff --git a/src/drivers/pncio/pncio_lustre_wrstr.c b/src/drivers/pncio/pncio_lustre_wrstr.c new file mode 100644 index 0000000000..fbd3c75d12 --- /dev/null +++ b/src/drivers/pncio/pncio_lustre_wrstr.c @@ -0,0 +1,363 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + + +#define BUFFERED_WRITE { \ + if (req_off >= writebuf_off + writebuf_len) { \ + if (writebuf_len) { \ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, \ + writebuf_off); \ + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) \ + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + if (w_len < 0) { \ + NCI_Free(writebuf); \ + return w_len; \ + } \ + total_w_len += w_len; \ + writebuf_off = req_off; \ + } \ + writebuf_off = req_off; \ + /* stripe_size alignment */ \ + writebuf_len = MIN(end_offset - writebuf_off + 1, \ + (writebuf_off / stripe_size + 1) * stripe_size \ + - writebuf_off); \ + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) \ + PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off); \ + if (r_len < 0) { \ + NCI_Free(writebuf); \ + return r_len; \ + } \ + } \ + write_sz = (MIN(req_len, writebuf_off + writebuf_len - req_off)); \ + memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, \ + write_sz); \ + while (write_sz != req_len) { \ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \ + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) \ + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + if (w_len < 0) { \ + NCI_Free(writebuf); \ + return w_len; \ + } \ + total_w_len += w_len; \ + req_len -= write_sz; \ + userbuf_off += write_sz; \ + writebuf_off += writebuf_len; \ + /* stripe_size alignment */ \ + writebuf_len = MIN(end_offset - writebuf_off + 1, \ + (writebuf_off / stripe_size + 1) * stripe_size \ + - writebuf_off); \ + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) \ + PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off); \ + if (r_len < 0) { \ + NCI_Free(writebuf); \ + return r_len; \ + } \ 
+ write_sz = MIN(req_len, writebuf_len); \ + memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \ + } \ +} + +/* this macro is used when filetype is contig and buftype is not contig. + * it does not do a read-modify-write and does not lock + */ +#define BUFFERED_WRITE_WITHOUT_READ { \ + if (req_off >= writebuf_off + writebuf_len) { \ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \ + if (w_len < 0) { \ + NCI_Free(writebuf); \ + return w_len; \ + } \ + total_w_len += w_len; \ + writebuf_off = req_off; \ + /* stripe_size alignment */ \ + writebuf_len = MIN(end_offset - writebuf_off + 1, \ + (writebuf_off / stripe_size + 1) * stripe_size \ + - writebuf_off); \ + } \ + write_sz = MIN(req_len, writebuf_off + writebuf_len - req_off); \ + memcpy(writebuf + req_off - writebuf_off, \ + (char *)buf + userbuf_off, write_sz); \ + while (write_sz != req_len) { \ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \ + if (w_len < 0) { \ + NCI_Free(writebuf); \ + return w_len; \ + } \ + total_w_len += w_len; \ + req_len -= write_sz; \ + userbuf_off += write_sz; \ + writebuf_off += writebuf_len; \ + /* stripe_size alignment */ \ + writebuf_len = MIN(end_offset - writebuf_off + 1, \ + (writebuf_off / stripe_size + 1) * stripe_size \ + - writebuf_off); \ + write_sz = MIN(req_len, writebuf_len); \ + memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \ + } \ +} + +MPI_Offset PNCIO_LUSTRE_WriteStrided(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + char *writebuf; + int i, j, k, st_index=0, stripe_size; + /* offset is in units of etype relative to the filetype. 
*/ + MPI_Offset i_offset, sum, num, size, abs_off_in_filetype=0, off, disp; + MPI_Offset userbuf_off, req_off, end_offset=0, writebuf_off, start_off; + MPI_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size; + MPI_Offset req_len, r_len, w_len, total_w_len=0; + MPI_Count bufsize, writebuf_len, write_sz; + + /* The case of both buftype and filetype being contiguous has gone to + * PNCIO_WriteContig(). + */ + +// printf("%s at %d:\n",__func__,__LINE__); + + if (fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) { + /* if user has disabled data sieving on writes, use naive + * approach instead. + */ + return PNCIO_GEN_WriteStrided_naive(fd, buf, buf_view, offset); + } + + +/* PnetCDF always sets these 3 conditions */ +assert(fd->filetype == MPI_BYTE); +assert(fd->flat_file.size == buf_view.size); +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + bufsize = buf_view.size; + + /* get striping info */ + stripe_size = fd->hints->striping_unit; + + if (!buf_view.is_contig && fd->flat_file.is_contig) { + /* noncontiguous in write buffer, contiguous in file. 
*/ + + off = fd->disp + offset; + if (fd->flat_file.count > 0) off += fd->flat_file.off[0]; + + start_off = off; + end_offset = start_off + bufsize - 1; + + /* write stripe size buffer each time */ + writebuf = (char *) NCI_Malloc(MIN(bufsize, stripe_size)); + writebuf_off = 0; + writebuf_len = 0; + + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed + */ + if (fd->atomicity || fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); + + for (i = 0; i < buf_view.count; i++) { + userbuf_off = buf_view.off[i]; + req_off = off; + req_len = buf_view.len[i]; + BUFFERED_WRITE_WITHOUT_READ; + off += buf_view.len[i]; + } + + /* write the buffer out the last round */ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + + if (fd->atomicity || fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, bufsize); + + NCI_Free(writebuf); + + if (w_len < 0) return w_len; + total_w_len += w_len; + + } else { /* contiguous buffer and non-contiguous in file */ + disp = fd->disp; +/* for non-contiguous in file, PnetCDF always uses disp == 0 */ +assert(disp == 0); + + /* find the starting index in fd->flat_file offset-length pairs */ + sum = 0; + for (i = 0; i < fd->flat_file.count; i++) { + sum += fd->flat_file.len[i]; + if (sum > offset) { + st_index = i; + fwr_size = sum - offset; + abs_off_in_filetype = fd->flat_file.off[i] + + offset - (sum - fd->flat_file.len[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + abs_off_in_filetype; + + start_off = offset; + + /* Write request is within single flat_file contig block. This could + * happen, for example, with subarray types that are actually fairly + * contiguous. 
+ */ + if (buf_view.is_contig && bufsize <= fwr_size) { + req_off = start_off; + req_len = bufsize; + end_offset = start_off + bufsize - 1; + writebuf = (char *) NCI_Malloc(MIN(bufsize, stripe_size)); + memset(writebuf, -1, (size_t)MIN(bufsize, stripe_size)); + writebuf_off = 0; + writebuf_len = 0; + userbuf_off = 0; + BUFFERED_WRITE_WITHOUT_READ; + + /* write the buffer out the last round */ + if (fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + if (w_len > 0) total_w_len += w_len; + + if (fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + + NCI_Free(writebuf); + + return total_w_len; + } + + /* Calculate end_offset, the last byte-offset that will be accessed. + * e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */ + + st_fwr_size = fwr_size; + j = st_index; + i_offset = fwr_size = MIN(st_fwr_size, bufsize); + end_offset = offset + fwr_size - 1; + while (i_offset < bufsize) { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + fwr_size = MIN(fd->flat_file.len[j], bufsize - i_offset); + i_offset += fwr_size; + end_offset = off + fwr_size - 1; + } + + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + writebuf_off = 0; + writebuf_len = 0; + writebuf = (char *) NCI_Malloc(stripe_size); + memset(writebuf, -1, stripe_size); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file should be the most + * common case. 
+ */ + i_offset = 0; + j = st_index; + off = offset; + fwr_size = MIN(st_fwr_size, bufsize); + while (i_offset < bufsize) { + if (fwr_size) { + req_off = off; + req_len = fwr_size; + userbuf_off = i_offset; + BUFFERED_WRITE; + } + i_offset += fwr_size; + if (i_offset >= bufsize) break; + + if (off + fwr_size < disp + fd->flat_file.off[j] + + fd->flat_file.len[j]) + off += fwr_size; + /* no more I/O needed. off is incremented by fwr_size. */ + else { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + fwr_size = MIN(fd->flat_file.len[j], + bufsize - i_offset); + } + } + } else { + /* noncontiguous in memory as well as in file */ + k = num = 0; + i_offset = buf_view.off[0]; + j = st_index; + off = offset; + fwr_size = st_fwr_size; + bwr_size = buf_view.len[0]; + + while (num < bufsize) { + size = MIN(fwr_size, bwr_size); + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + BUFFERED_WRITE; + } + num += size; + if (num >= bufsize) break; + + new_fwr_size = fwr_size; + new_bwr_size = bwr_size; + + if (size == fwr_size) { + /* reached end of contiguous block in file */ + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + + new_fwr_size = fd->flat_file.len[j]; + if (size != bwr_size) { + i_offset += size; + new_bwr_size -= size; + } + } + + if (size == bwr_size) { + /* reached end of contiguous block in memory */ + k++; +assert(k < buf_view.count); + i_offset = buf_view.off[k]; + new_bwr_size = buf_view.len[k]; + if (size != fwr_size) { + off += size; + new_fwr_size -= size; + } + } + fwr_size = new_fwr_size; + bwr_size = new_bwr_size; + } + } + + /* write the buffer out the last round */ + if (writebuf_len) { + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + if (w_len < 0) return w_len; + total_w_len += w_len; + } + if (fd->atomicity || 
fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + NCI_Free(writebuf); + } + + return buf_view.size; +} diff --git a/src/drivers/pncio/pncio_open.c b/src/drivers/pncio/pncio_open.c new file mode 100644 index 0000000000..c7ec8539b5 --- /dev/null +++ b/src/drivers/pncio/pncio_open.c @@ -0,0 +1,372 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include /* open(), O_CREAT */ +#include /* open(), umask(), fstat() */ + +#if defined(HAVE_SYS_STAT_H) && HAVE_SYS_STAT_H == 1 +#include /* fstat() */ +#endif +#include /* fstat() */ + +#include +#include + +#include + +#include "pncio.h" + +/*----< GEN_set_cb_node_list() >---------------------------------------------*/ +/* Construct the list of I/O aggregators. It sets the followings. + * fd->hints->ranklist[]. + * fd->hints->cb_nodes and set file info for hint cb_nodes. + * fd->is_agg: indicating whether this rank is an I/O aggregator + * fd->my_cb_nodes_index: index into fd->hints->ranklist[]. 
-1 if N/A + */ +static +int GEN_set_cb_node_list(PNCIO_File *fd) +{ + int i, j, k, nprocs, rank, *nprocs_per_node, **ranks_per_node; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + if (fd->hints->cb_nodes == 0) + /* If hint cb_nodes is not set by user, select one rank per node to be + * an I/O aggregator + */ + fd->hints->cb_nodes = fd->node_ids.num_nodes; + else if (fd->hints->cb_nodes > nprocs) + /* cb_nodes must be <= nprocs */ + fd->hints->cb_nodes = nprocs; + + fd->hints->ranklist = (int *) NCI_Malloc(sizeof(int) * fd->hints->cb_nodes); + if (fd->hints->ranklist == NULL) + return NC_ENOMEM; + + /* number of MPI processes running on each node */ + nprocs_per_node = (int *) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int)); + + for (i=0; inode_ids.ids[i]]++; + + /* construct rank IDs of MPI processes running on each node */ + ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->node_ids.num_nodes); + ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs); + for (i=1; inode_ids.num_nodes; i++) + ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1]; + + for (i=0; inode_ids.num_nodes; i++) nprocs_per_node[i] = 0; + + /* Populate ranks_per_node[], list of MPI ranks running on each node. + * Populate nprocs_per_node[], number of MPI processes on each node. 
+ */ + for (i=0; inode_ids.ids[i]; + ranks_per_node[k][nprocs_per_node[k]] = i; + nprocs_per_node[k]++; + } + + /* select process ranks from nodes in a round-robin fashion to be I/O + * aggregators + */ + k = j = 0; + for (i=0; ihints->cb_nodes; i++) { + if (j >= nprocs_per_node[k]) { /* if run out of ranks in this node k */ + k++; + if (k == fd->node_ids.num_nodes) { /* round-robin to first node */ + k = 0; + j++; + } + } + /* select jth rank of node k as an I/O aggregator */ + fd->hints->ranklist[i] = ranks_per_node[k++][j]; + if (rank == fd->hints->ranklist[i]) { + fd->is_agg = 1; + fd->my_cb_nodes_index = i; + } + if (k == fd->node_ids.num_nodes) { /* round-robin to first node */ + k = 0; + j++; + } + } + NCI_Free(ranks_per_node[0]); + NCI_Free(ranks_per_node); + NCI_Free(nprocs_per_node); + + return 0; +} + +/*----< GEN_create() >-------------------------------------------------------*/ +/* 1. root creates the file + * 2. root sets and obtains striping info + * 3. root broadcasts striping info + * 4. non-root processes receive striping info from root + * 5. non-root processes opens the fie + */ +static int +GEN_create(PNCIO_File *fd, + int mpi_io_mode) +{ + int err=NC_NOERR, rank, amode, perm, old_mask; + int stripin_info[4] = {-1, -1, -1, -1}; + + MPI_Comm_rank(fd->comm, &rank); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +int world_rank; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); +if (world_rank == 0) { printf("\nxxxx %s at %d: ---- %s\n",__func__,__LINE__,fd->filename); fflush(stdout);} +#endif + + amode = O_CREAT; + if (mpi_io_mode & MPI_MODE_RDWR) amode |= O_RDWR; + + old_mask = umask(022); + umask(old_mask); + perm = old_mask ^ PNCIO_PERM; + + /* root process creates the file first, followed by all processes open the + * file. 
+ */ + if (rank > 0) goto err_out; + + fd->fd_sys = open(fd->filename, amode, perm); + if (fd->fd_sys == -1) { + fprintf(stderr,"%s line %d: rank %d fails to create file %s (%s)\n", + __func__,__LINE__, rank, fd->filename, strerror(errno)); + err = ncmpii_error_posix2nc("open"); + goto err_out; + } + fd->is_open = 1; + stripin_info[0] = 1048576; /* default to 1 MiB */ + + /* Only root obtains the striping information and bcast to all other + * processes. For UFS, file striping is the file system block size. + */ +#if defined(HAVE_SYS_STAT_H) && HAVE_SYS_STAT_H == 1 + struct stat statbuf; + err = fstat(fd->fd_sys, &statbuf); + if (err >= 0) + /* file system block size usually < MAX_INT */ + stripin_info[0] = (int)statbuf.st_blksize; +#endif + +err_out: + MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm); + + fd->hints->striping_unit = stripin_info[0]; + fd->hints->striping_factor = stripin_info[1]; + fd->hints->start_iodevice = stripin_info[2]; + + if (rank > 0) { /* non-root processes */ + fd->fd_sys = open(fd->filename, O_RDWR, perm); + if (fd->fd_sys == -1) { + fprintf(stderr,"%s line %d: rank %d failure to open file %s (%s)\n", + __func__,__LINE__, rank, fd->filename, strerror(errno)); + return ncmpii_error_posix2nc("ioctl"); + } + fd->is_open = 1; + } + + /* construct cb_nodes rank list */ + GEN_set_cb_node_list(fd); + MPI_Info_set(fd->info, "romio_filesystem_type", "UFS:"); + + return err; +} + +/*----< GEN_open() >---------------------------------------------------------*/ +/* 1. all processes open the file. + * 2. 
root obtains striping info and broadcasts to all others + */ +static int +GEN_open(PNCIO_File *fd) +{ + int err=NC_NOERR, rank, perm, old_mask, omode; + int stripin_info[4] = {1048576, -1, -1, -1}; + + MPI_Comm_rank(fd->comm, &rank); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +int world_rank; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); +if (world_rank == 0) { printf("\nxxxx %s at %d: ---- %s\n",__func__,__LINE__,fd->filename); fflush(stdout);} +#endif + + old_mask = umask(022); + umask(old_mask); + perm = old_mask ^ PNCIO_PERM; + + if (fIsSet(fd->access_mode, MPI_MODE_RDWR)) + omode = O_RDWR; + else + omode = O_RDONLY; + + /* All processes open the file. */ + fd->fd_sys = open(fd->filename, omode, perm); + if (fd->fd_sys == -1) { + fprintf(stderr, "%s line %d: rank %d failure to open file %s (%s)\n", + __func__,__LINE__, rank, fd->filename, strerror(errno)); + err = ncmpii_error_posix2nc("open"); + goto err_out; + } + fd->is_open = 1; + stripin_info[0] = 1048576; /* default to 1 MiB */ + + /* Only root obtains the striping information and bcast to all other + * processes. For UFS, file striping is the file system block size. 
+ */ +#if defined(HAVE_SYS_STAT_H) && HAVE_SYS_STAT_H == 1 + if (rank == 0) { + /* Get the underlying file system block size as file striping_unit */ + struct stat statbuf; + err = fstat(fd->fd_sys, &statbuf); + if (err >= 0) + /* file system block size usually < MAX_INT */ + stripin_info[0] = (int)statbuf.st_blksize; + } +#endif + +err_out: + MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm); + fd->hints->striping_unit = stripin_info[0]; + fd->hints->striping_factor = stripin_info[1]; + fd->hints->start_iodevice = stripin_info[2]; + + /* construct cb_nodes rank list */ + GEN_set_cb_node_list(fd); + MPI_Info_set(fd->info, "romio_filesystem_type", "UFS:"); + + return err; +} + +/*----< PNCIO_File_open() >---------------------------------------------------*/ +int PNCIO_File_open(MPI_Comm comm, + const char *filename, + int amode, + MPI_Info info, + PNCIO_File *fd) +{ + /* Before reaching to this subroutine, PNCIO_FileSysType() should have been + * called to check the file system type. + */ + char value[MPI_MAX_INFO_VAL + 1], int_str[16]; + int i, err, min_err, status=NC_NOERR; + + fd->comm = comm; + fd->filename = filename; /* without file system type name prefix */ + fd->atomicity = 0; + fd->filetype = MPI_BYTE; + fd->is_open = 0; + fd->access_mode = amode; + fd->io_buf = NULL; /* collective buffer used by aggregators only */ + + fd->flat_file.count = 0; /* flattend fileview in offset-length pairs */ + fd->flat_file.size = -1; + fd->flat_file.is_contig = 1; + fd->flat_file.off = NULL; + fd->flat_file.len = NULL; + + /* create and initialize info object */ + fd->hints = (PNCIO_Hints*) NCI_Calloc(1, sizeof(PNCIO_Hints)); + status = PNCIO_File_SetInfo(fd, info); + if (status != NC_NOERR && status != NC_EMULTIDEFINE_HINTS) { + /* Inconsistent I/O hints is not a fatal error. + * In PNCIO_File_SetInfo(), root's hints overwrite local's. 
+ */ + goto err_out; + } + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + for (i=0; iwrite_timing[i] = fd->read_timing[i] = 0; + fd->write_counter[i] = fd->read_counter[i] = 0; + } +#endif + + assert(fd->file_system != PNCIO_FSTYPE_MPIIO); + + /* TODO: When hint romio_no_indep_rw hint is set to true, only aggregators open + * the file. + * Note because fd->is_agg is set at the end of create/open call. + */ + if (fd->file_system == PNCIO_LUSTRE) { + if (amode & MPI_MODE_CREATE) + err = PNCIO_Lustre_create(fd, amode); + else + err = PNCIO_Lustre_open(fd); + } + else { + if (amode & MPI_MODE_CREATE) + err = GEN_create(fd, amode); + else + err = GEN_open(fd); + } + if (err != NC_NOERR) { /* fatal error */ + status = err; + goto err_out; + } + + /* set file striping hints */ + snprintf(int_str, 16, "%d", fd->hints->striping_unit); + MPI_Info_set(fd->info, "striping_unit", int_str); + + snprintf(int_str, 16, "%d", fd->hints->striping_factor); + MPI_Info_set(fd->info, "striping_factor", int_str); + + snprintf(int_str, 16, "%d", fd->hints->start_iodevice); + MPI_Info_set(fd->info, "start_iodevice", int_str); + + /* set file striping hints */ + snprintf(int_str, 16, "%d", fd->hints->cb_nodes); + MPI_Info_set(fd->info, "cb_nodes", int_str); + + /* add hint "cb_node_list", list of aggregators' rank IDs */ + snprintf(value, 16, "%d", fd->hints->ranklist[0]); + for (i=1; ihints->cb_nodes; i++) { + snprintf(int_str, 16, " %d", fd->hints->ranklist[i]); + if (strlen(value) + strlen(int_str) >= MPI_MAX_INFO_VAL-5) { + strcat(value, " ..."); + break; + } + strcat(value, int_str); + } + MPI_Info_set(fd->info, "cb_node_list", value); + + /* collective buffer size must be at least file striping size */ + if (fd->hints->cb_buffer_size < fd->hints->striping_unit) { + fd->hints->cb_buffer_size = fd->hints->striping_unit; + snprintf(int_str, 16, " %d", fd->hints->cb_buffer_size); + MPI_Info_set(fd->info, "cb_buffer_size", int_str); + } + + /* collective buffer is used only 
by I/O aggregators only */ + if (fd->is_agg) { + fd->io_buf = NCI_Calloc(1, fd->hints->cb_buffer_size); + if (fd->io_buf == NULL) /* fatal error */ + status = NC_ENOMEM; + } + +err_out: + MPI_Allreduce(&status, &min_err, 1, MPI_INT, MPI_MIN, comm); + /* All NC errors are < 0 */ + + if (min_err != NC_NOERR) { + if (status == NC_NOERR && fd->is_open) + /* close file if opened successfully */ + close(fd->fd_sys); + NCI_Free(fd->hints); + if (fd->info != MPI_INFO_NULL) + MPI_Info_free(&(fd->info)); + if (fd->io_buf != NULL) + NCI_Free(fd->io_buf); + } + return status; +} + diff --git a/src/drivers/pncio/pncio_read.c b/src/drivers/pncio/pncio_read.c new file mode 100644 index 0000000000..a7594ed2f7 --- /dev/null +++ b/src/drivers/pncio/pncio_read.c @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include /* pread() */ + +#include + +#include "pncio.h" + +/*----< PNCIO_ReadContig() >--------------------------------------------------*/ +MPI_Offset PNCIO_ReadContig(PNCIO_File *fd, + void *buf, + MPI_Offset r_size, + MPI_Offset offset) +{ + ssize_t err = 0; + size_t r_count; + MPI_Offset bytes_xfered = 0; + char *p; + +// printf("%s at %d: %s pread offset=%lld r_size=%lld\n",__func__,__LINE__,fd->filename,offset,r_size); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double timing = MPI_Wtime(); +#endif + p = (char *) buf; + while (bytes_xfered < r_size) { + r_count = r_size - bytes_xfered; + err = pread(fd->fd_sys, p, r_count, offset + bytes_xfered); + if (err == -1) + goto ioerr; + if (err == 0) + break; + bytes_xfered += err; + p += err; + } +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->read_timing[2] += MPI_Wtime() - timing; +#endif + +ioerr: + if (err == -1) + bytes_xfered = ncmpii_error_posix2nc("pread"); + +/* +if (offset > 0) {unsigned long long wkl[4]; + memcpy(wkl, 
buf, sizeof(unsigned long long) * 4); + ncmpii_in_swapn(wkl, 4, 8); + printf("%s at %d: %s pread offset=%lld r_size=%lld wkl=%llu %lld %lld %lld\n",__func__,__LINE__,fd->filename,offset,r_size,wkl[0],wkl[1],wkl[2],wkl[3]); +} +*/ + + return bytes_xfered; +} + +/*----< file_read() >--------------------------------------------------------*/ +/* This is an independent call. */ +static +MPI_Offset file_read(PNCIO_File *fd, + MPI_Offset offset, /* relative to fileview */ + void *buf, + PNCIO_View buf_view) +{ + MPI_Offset r_len=0; + +// printf("%s at %d: offset=%lld buf_view size=%lld\n",__func__,__LINE__, offset,buf_view.size); + +assert(fd->filetype == MPI_BYTE); +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + if (buf_view.size == 0) /* zero-sized request */ + return NC_NOERR; + + if (buf_view.is_contig && fd->flat_file.is_contig) { + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; + r_len = PNCIO_ReadContig(fd, buf, buf_view.size, offset); + } + else + r_len = PNCIO_GEN_ReadStrided(fd, buf, buf_view, offset); + + return r_len; +} + +/*----< PNCIO_File_read_at() >------------------------------------------------*/ +/* This is an independent call. + * offset is a position in the file relative to the current view, expressed as + * a count of etypes. + */ +MPI_Offset PNCIO_File_read_at(PNCIO_File *fh, + MPI_Offset offset, + void *buf, + PNCIO_View buf_view) +{ + assert(fh != NULL); + + if (buf_view.size == 0) return NC_NOERR; + + if (buf_view.size < 0) return NC_ENEGATIVECNT; + + /* PnetCDF has only 2 modes: read-only and read-write */ + // if (fh->access_mode & MPI_MODE_RDONLY) return NC_EPERM; + + return file_read(fh, offset, buf, buf_view); +} + +/*----< PNCIO_File_read_at_all() >--------------------------------------------*/ +/* This is a collective call. + * offset is a position in the file relative to the current view, expressed as + * a count of etypes. 
+ */
+MPI_Offset PNCIO_File_read_at_all(PNCIO_File *fh,
+                                  MPI_Offset offset,
+                                  void *buf,
+                                  PNCIO_View buf_view)
+{
+    int err=NC_NOERR;
+    MPI_Offset r_len;
+
+    assert(fh != NULL);
+
+    /* A negative size is only recorded here; the collective read below is
+     * still called so that this rank takes part in it.
+     * NOTE(review): buf_view is passed on unchanged, i.e. still carrying the
+     * negative size - confirm PNCIO_GEN_ReadStridedColl() tolerates that.
+     */
+    if (buf_view.size < 0) err = NC_ENEGATIVECNT;
+
+    /* PnetCDF has only 2 modes: read-only and read-write */
+    // if (fh->access_mode & MPI_MODE_RDONLY && st == NC_NOERR) st = NC_EPERM;
+
+    r_len = PNCIO_GEN_ReadStridedColl(fh, buf, buf_view, offset);
+
+    /* the locally detected error takes precedence over the read result */
+    return (err == NC_NOERR) ? r_len : err;
+}
+
diff --git a/src/drivers/pncio/pncio_read_coll.c b/src/drivers/pncio/pncio_read_coll.c
new file mode 100644
index 0000000000..e4e8fb47eb
--- /dev/null
+++ b/src/drivers/pncio/pncio_read_coll.c
@@ -0,0 +1,821 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include /* type bool */
+
+#include
+
+/* prototypes of functions used for collective reads only. */
+static
+MPI_Offset Read_and_exch(PNCIO_File *fd, void *buf,
+                          PNCIO_View buf_view, int nprocs,
+                          int myrank, PNCIO_Access *others_req,
+                          MPI_Offset
+                          min_st_offset, MPI_Offset fd_size,
+                          MPI_Offset * fd_start, MPI_Offset * fd_end,
+                          MPI_Aint * buf_idx);
+
+static void R_Exchange_data(PNCIO_File *fd, void *buf,
+                            PNCIO_View buf_view,
+                            MPI_Count * send_size, MPI_Count * recv_size,
+                            MPI_Count * count, MPI_Count * start_pos,
+                            MPI_Count * partial_send,
+                            MPI_Count * recd_from_proc, int nprocs,
+                            int myrank,
+                            MPI_Offset min_st_offset,
+                            MPI_Offset fd_size,
+                            MPI_Offset * fd_start, MPI_Offset * fd_end,
+                            PNCIO_Access * others_req,
+                            int iter, MPI_Aint * buf_idx,
+                            MPI_Aint * actual_recved_bytes);
+static void Fill_user_buffer(PNCIO_File *fd, void *buf,
+                             PNCIO_View buf_view, char **recv_buf,
+                             MPI_Count * recv_size,
+                             MPI_Count * recd_from_proc, int nprocs,
+                             MPI_Offset min_st_offset,
+                             MPI_Offset fd_size, MPI_Offset * fd_start,
+                             MPI_Offset * fd_end);
+
+/* Collective read.
+ *
+ * When hint romio_cb_read is disabled, or it is "auto" and no interleaving
+ * among the processes' access ranges is detected, this falls back to an
+ * independent read (PNCIO_ReadContig() or PNCIO_GEN_ReadStrided()).
+ * Otherwise the two-phase method is used: file domains are computed, requests
+ * are exchanged, and Read_and_exch() performs the reads and the data exchange.
+ *
+ * fd->flat_file may be replaced temporarily inside this function; the saved
+ * copy is restored on every return path below.
+ *
+ * Returns the value of the independent read call in the fallback case (0 for
+ * a zero-sized request); in the two-phase case, the value returned by
+ * Read_and_exch() (a negative value is passed through unchanged).
+ */
+MPI_Offset PNCIO_GEN_ReadStridedColl(PNCIO_File *fd,
+                                     void *buf,
+                                     PNCIO_View buf_view,
+                                     MPI_Offset offset)
+{
+/* Uses a generalized version of the extended two-phase method described
+   in "An Extended Two-Phase Method for Accessing Sections of
+   Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
+   Scientific Programming, (5)4:301--317, Winter 1996.
+   http://www.mcs.anl.gov/home/thakur/ext2ph.ps */
+
+    PNCIO_Access *my_req;
+    /* array of nprocs structures, one for each other process in
+     * whose file domain this process's request lies */
+
+    PNCIO_Access *others_req;
+    /* array of nprocs structures, one for each other process
+     * whose request lies in this process's file domain. */
+
+    int nprocs, nprocs_for_coll, myrank;
+    int interleave_count = 0;
+    MPI_Count *count_my_req_per_proc, count_my_req_procs;
+    MPI_Count *count_others_req_per_proc, count_others_req_procs;
+    MPI_Offset start_offset, end_offset, fd_size, min_st_offset;
+    MPI_Offset *st_offsets = NULL, *fd_start = NULL,
+               *fd_end = NULL, *end_offsets = NULL;
+    MPI_Aint *buf_idx = NULL;
+    MPI_Offset r_len, total_r_len=0;
+
+// printf("%s at %d: offset %lld buf_view size %lld flat_file.count %lld size %lld is_contig %d\n",__func__,__LINE__,offset,buf_view.size,fd->flat_file.count, fd->flat_file.size, fd->flat_file.is_contig);
+
+    /* one_len backs fd->flat_file.len when the whole file is visible (see
+     * below); its type depends on large-count support */
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Offset one_len = (MPI_Offset)buf_view.size;
+#else
+    int one_len = (int)buf_view.size;
+#endif
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+double curT = MPI_Wtime();
+#endif
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+
+    /* PnetCDF never reuses a fileview across two or more PNCIO calls. As this
+     * subroutine may modify the contents of fd->flat_file, we save its
+     * contents and restore it before leaving this subroutine.
+     */
+    PNCIO_View saved_flat_file = fd->flat_file;
+
+    if (fd->flat_file.count == 0) { /* whole file is visible */
+        /* set flat_file as a single contiguous offset-length pair.
+         * off and len point at the locals offset and one_len, so they are
+         * only valid until this function returns. */
+        fd->flat_file.off = &offset;
+        fd->flat_file.len = &one_len;
+        fd->flat_file.size = one_len;
+        fd->flat_file.count = 1;
+        fd->flat_file.is_contig = 1;
+        start_offset = offset;
+        end_offset = offset + buf_view.size - 1;
+    }
+    else {
+        /* When flat_file is not contiguous, PnetCDF always calls this
+         * subroutine with offset == 0.
+         */
+        assert(offset == 0);
+    }
+
+    /* number of aggregators, cb_nodes, is stored in the hints */
+    nprocs_for_coll = fd->hints->cb_nodes;
+
+    /* only check for interleaving if romio_cb_read isn't disabled */
+    if (fd->hints->romio_cb_read != PNCIO_HINT_DISABLE) {
+        /* For this process's request, calculate the file start and end
+         * offsets. Note: end_offset points to the last byte-offset that will
+         * be accessed, e.g., if start_offset=0 and 100 bytes to be read,
+         * end_offset=99
+         *
+         * Note flat_file.off[] is always relative to beginning of file.
+         */
+        start_offset = fd->flat_file.off[0];
+        end_offset = fd->flat_file.off[fd->flat_file.count-1] +
+                     fd->flat_file.len[fd->flat_file.count-1] - 1;
+
+        /* each process communicates its start and end offsets to other
+         * processes. The result is an array each of start and end offsets
+         * stored in order of process rank. Both arrays share one allocation:
+         * end_offsets is the second half of st_offsets. */
+        st_offsets = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset));
+        end_offsets = st_offsets + nprocs;
+
+        MPI_Allgather(&start_offset, 1, MPI_OFFSET, st_offsets, 1, MPI_OFFSET,
+                      fd->comm);
+        MPI_Allgather(&end_offset, 1, MPI_OFFSET, end_offsets, 1, MPI_OFFSET,
+                      fd->comm);
+
+        /* Are the accesses of different processes interleaved? Below is a
+         * rudimentary check for interleaving, but should suffice for the
+         * moment. */
+        for (int i = 1; i < nprocs; i++)
+            if ((st_offsets[i] < end_offsets[i - 1]) &&
+                (st_offsets[i] <= end_offsets[i]))
+                interleave_count++;
+    }
+
+    if (fd->hints->romio_cb_read == PNCIO_HINT_DISABLE
+        || (!interleave_count && (fd->hints->romio_cb_read == PNCIO_HINT_AUTO))) {
+        /* switch to independent read */
+
+        if (st_offsets != NULL) NCI_Free(st_offsets);
+
+        /* restore flattened file view before leaving this subroutine */
+        fd->flat_file = saved_flat_file;
+
+        if (buf_view.size == 0) /* zero-size request */
+            return 0;
+
+        if (buf_view.is_contig && fd->flat_file.is_contig) {
+            /* When fd->flat_file.is_contig, it is still possible
+             * fd->flat_file.count > 0 and when this happens
+             * fd->flat_file.count should be 1, which comes from PnetCDF wait
+             * when the number of nonblocking requests is 1.
+             */
+            if (fd->flat_file.count > 0) offset += fd->flat_file.off[0];
+            return PNCIO_ReadContig(fd, buf, buf_view.size, offset);
+        }
+        else
+            return PNCIO_GEN_ReadStrided(fd, buf, buf_view, offset);
+    }
+
+    /* We're going to perform aggregation of I/O. Here we call
+     * PNCIO_Calc_file_domains() to determine what processes will handle I/O
+     * to what regions. We pass nprocs_for_coll into this function; it is
+     * used to determine how many processes will perform I/O, which is also
+     * the number of regions into which the range of bytes must be divided.
+     * These regions are called "file domains", or FDs.
+     *
+     * When this function returns, fd_start, fd_end, fd_size, and
+     * min_st_offset will be filled in. fd_start holds the starting byte
+     * location for each file domain. fd_end holds the ending byte location.
+     * min_st_offset holds the minimum byte location that will be accessed.
+     *
+     * Both fd_start[] and fd_end[] are indexed by an aggregator number; this
+     * needs to be mapped to an actual rank in the communicator later.
+     *
+     */
+    PNCIO_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll,
+                            &min_st_offset, &fd_start, &fd_end, &fd_size,
+                            fd->hints->striping_unit);
+
+    /* calculate where the portions of the access requests of this process
+     * are located in terms of the file domains. this could be on the same
+     * process or on other processes. this function fills in:
+     * count_my_req_procs - number of processes (including this one) for which
+     *     this process has requests in their file domain
+     * count_my_req_per_proc - count of requests for each process, indexed
+     *     by rank of the process
+     * my_req[] - array of data structures describing the requests to be
+     *     performed by each process (including self). indexed by rank.
+     * buf_idx[] - array of locations into which data can be directly moved;
+     *     this is only valid for contiguous buffer case
+     */
+    PNCIO_Calc_my_req(fd, min_st_offset, fd_end, fd_size, nprocs,
+                      &count_my_req_procs, &count_my_req_per_proc, &my_req,
+                      &buf_idx);
+
+    /* perform a collective communication in order to distribute the
+     * data calculated above. fills in the following:
+     * count_others_req_procs - number of processes (including this
+     *     one) which have requests in this process's file domain.
+     * count_others_req_per_proc[] - number of separate contiguous
+     *     requests from proc i lie in this process's file domain.
+     */
+    PNCIO_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
+                          my_req, nprocs, myrank, &count_others_req_procs,
+                          &count_others_req_per_proc, &others_req);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    if (fd->is_agg) fd->read_timing[1] += MPI_Wtime() - curT;
+#endif
+
+    /* read data in sizes of no more than collective buffer size,
+     * communicate, and fill user buf.
+     */
+    r_len = Read_and_exch(fd, buf, buf_view, nprocs, myrank, others_req,
+                          min_st_offset, fd_size, fd_start, fd_end, buf_idx);
+    if (r_len > 0) total_r_len += r_len;
+
+    /* free all memory allocated for collective I/O */
+    PNCIO_Free_my_req(count_my_req_per_proc, my_req, buf_idx);
+    PNCIO_Free_others_req(count_others_req_per_proc, others_req);
+
+    NCI_Free(st_offsets);
+    /* NOTE(review): only fd_start is freed here; presumably fd_end shares
+     * its allocation - confirm against PNCIO_Calc_file_domains(). */
+    NCI_Free(fd_start);
+
+    /* restore flattened file view before leaving this subroutine */
+    fd->flat_file = saved_flat_file;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    if (fd->is_agg) fd->read_timing[0] += MPI_Wtime() - curT;
+#endif
+
+    /* a negative r_len is an error code and is passed through as is */
+    return (r_len < 0) ? r_len : total_r_len;
+}
+
+static
+MPI_Offset Read_and_exch(PNCIO_File *fd, void *buf,
+                          PNCIO_View buf_view, int nprocs,
+                          int myrank, PNCIO_Access *others_req,
+                          MPI_Offset min_st_offset, MPI_Offset fd_size,
+                          MPI_Offset * fd_start, MPI_Offset * fd_end,
+                          MPI_Aint * buf_idx)
+{
+/* Read in sizes of no more than coll_bufsize, an info parameter.
+   Send data to appropriate processes.
+   Place recd. data in user buf.
+   The idea is to reduce the amount of extra memory required for
+   collective I/O. If all data were read all at once, which is much
+   easier, it would require temp space more than the size of user_buf,
+   which is often unacceptable. For example, to read a distributed
+   array from a file, where each local array is 8Mbytes, requiring
+   at least another 8Mbytes of temp space is unacceptable.
*/ + + int i, m, ntimes, max_ntimes; + MPI_Offset st_loc = -1, end_loc = -1, off, done, real_off; + char *read_buf = NULL, *tmp_buf; + MPI_Count *curr_offlen_ptr, *count, *send_size, *recv_size; + MPI_Count *partial_send, *recd_from_proc, *start_pos; + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + MPI_Offset real_size, size, for_curr_iter, for_next_iter; + int rank; + MPI_Aint coll_bufsize; + MPI_Aint actual_recved_bytes = 0; + MPI_Offset r_len; + +/* calculate the number of reads of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. + coll_bufsize is obtained from the hints object. */ + + coll_bufsize = fd->hints->cb_buffer_size; + + /* grab some initial values for st_loc and end_loc */ + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + st_loc = others_req[i].offsets[0]; + end_loc = others_req[i].offsets[0]; + break; + } + } + + /* now find the real values */ + for (i = 0; i < nprocs; i++) + for (MPI_Count j = 0; j < others_req[i].count; j++) { + st_loc = MIN(st_loc, others_req[i].offsets[j]); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + + others_req[i].lens[j] - 1)); + } + + /* calculate ntimes, the number of times this process must perform I/O + * operations in order to complete all the requests it has received. + * the need for multiple I/O operations comes from the restriction that + * we only use coll_bufsize bytes of memory for internal buffering. + */ + if ((st_loc == -1) && (end_loc == -1)) { + /* this process does no I/O. 
*/ + ntimes = 0; + } else { + /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize) */ + ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize); + } + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->read_counter[0] = MAX(fd->read_counter[0], max_ntimes); +#endif + + read_buf = fd->io_buf; /* Allocated at open time */ + + curr_offlen_ptr = NCI_Calloc(nprocs * 7, sizeof(*curr_offlen_ptr)); + /* its use is explained below. calloc initializes to 0. */ + + count = curr_offlen_ptr + nprocs; + /* to store count of how many off-len pairs per proc are satisfied + * in an iteration. */ + + partial_send = count + nprocs; + /* if only a portion of the last off-len pair is sent to a process + * in a particular iteration, the length sent is stored here. + * calloc initializes to 0. */ + + send_size = partial_send + nprocs; + /* total size of data to be sent to each proc. in an iteration */ + + recv_size = send_size + nprocs; + /* total size of data to be recd. from each proc. in an iteration. + * Of size nprocs so that I can use MPI_Alltoall later. */ + + recd_from_proc = recv_size + nprocs; + /* amount of data recd. so far from each proc. Used in Fill_user_buffer. + * initialized to 0 here. */ + + start_pos = recd_from_proc + nprocs; + /* used to store the starting value of curr_offlen_ptr[i] in + * this iteration */ + + done = 0; + off = st_loc; + for_curr_iter = for_next_iter = 0; + + MPI_Comm_rank(fd->comm, &rank); + + for (m = 0; m < ntimes; m++) { + /* read buf of size coll_bufsize (or less) */ + /* go through all others_req and check if any are satisfied + * by the current read */ + + /* since MPI guarantees that displacements in filetypes are in + * monotonically nondecreasing order, I can maintain a pointer + * (curr_offlen_ptr) to + * current off-len pair for each process in others_req and scan + * further only from there. 
There is still a problem of filetypes + * such as: (1, 2, 3 are not process nos. They are just numbers for + * three chunks of data, specified by a filetype.) + * + * 1 -------!-- + * 2 -----!---- + * 3 --!----- + * + * where ! indicates where the current read_size limitation cuts + * through the filetype. I resolve this by reading up to !, but + * filling the communication buffer only for 1. I copy the portion + * left over for 2 into a tmp_buf for use in the next + * iteration. i.e., 2 and 3 will be satisfied in the next + * iteration. This simplifies filling in the user's buf at the + * other end, as only one off-len pair with incomplete data + * will be sent. I also don't need to send the individual + * offsets and lens along with the data, as the data is being + * sent in a particular order. */ + + /* off = start offset in the file for the data actually read in + * this iteration + * size = size of data read corresponding to off + * real_off = off minus whatever data was retained in memory from + * previous iteration for cases like 2, 3 illustrated above + * real_size = size plus the extra corresponding to real_off + * req_off = off in file for a particular contiguous request + * minus what was satisfied in previous iteration + * req_size = size corresponding to req_off */ + + size = MIN(coll_bufsize, end_loc - st_loc + 1 - done); + bool flag = false; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + for (MPI_Count j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + MPI_Offset req_off; + if (partial_send[i]) { + req_off = others_req[i].offsets[j] + partial_send[i]; + } else { + req_off = others_req[i].offsets[j]; + } + if (req_off < off + size) { + flag = true; + } + } + } + } + if (flag) { + /* This should be only reached by I/O aggregators only */ + r_len = PNCIO_ReadContig(fd, read_buf + for_curr_iter, size, off); + if (r_len < 0) return r_len; + size = r_len; + } + + real_off = off - for_curr_iter; + real_size = size + for_curr_iter; + 
+ for (i = 0; i < nprocs; i++) + count[i] = send_size[i] = 0; + for_next_iter = 0; + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + MPI_Count j = 0; + for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + MPI_Offset req_off; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset req_len; +#else + int req_len; +#endif + if (partial_send[i]) { + /* this request may have been partially + * satisfied in the previous iteration. */ + req_off = others_req[i].offsets[j] + partial_send[i]; + req_len = others_req[i].lens[j] - partial_send[i]; + partial_send[i] = 0; + /* modify the off-len pair to reflect this change */ + others_req[i].offsets[j] = req_off; + others_req[i].lens[j] = req_len; + } else { + req_off = others_req[i].offsets[j]; + req_len = others_req[i].lens[j]; + } + if (req_off < real_off + real_size) { + count[i]++; + MPI_Aint addr; + MPI_Get_address(read_buf + req_off - real_off, &addr); + others_req[i].mem_ptrs[j] = addr; + send_size[i] += (MIN(real_off + real_size - req_off, req_len)); + + if (real_off + real_size - req_off < req_len) { + partial_send[i] = (real_off + real_size - req_off); + if ((j + 1 < others_req[i].count) && + (others_req[i].offsets[j + 1] < real_off + real_size)) { + /* this is the case illustrated in the + * figure above. 
*/ + for_next_iter = MAX(for_next_iter, + real_off + real_size - + others_req[i].offsets[j + 1]); + /* max because it must cover requests + * from different processes */ + } + break; + } + } else + break; + } + curr_offlen_ptr[i] = j; + } + } + + for_curr_iter = for_next_iter; + + MPI_Aint recved_bytes = 0; + R_Exchange_data(fd, buf, buf_view, send_size, recv_size, count, + start_pos, partial_send, recd_from_proc, nprocs, + myrank, min_st_offset, fd_size, fd_start, fd_end, + others_req, m, buf_idx, &recved_bytes); + actual_recved_bytes += recved_bytes; + + + if (for_next_iter) { + tmp_buf = (char *) NCI_Malloc(for_next_iter); + memcpy(tmp_buf, read_buf + real_size - for_next_iter, for_next_iter); + NCI_Free(fd->io_buf); + fd->io_buf = (char *) NCI_Malloc(for_next_iter + coll_bufsize); + memcpy(fd->io_buf, tmp_buf, for_next_iter); + read_buf = fd->io_buf; + NCI_Free(tmp_buf); + } + + off += size; + done += size; + } + + for (i = 0; i < nprocs; i++) + count[i] = send_size[i] = 0; + for (m = ntimes; m < max_ntimes; m++) { + /* nothing to send, but check for recv. 
*/ + MPI_Aint recved_bytes = 0; + R_Exchange_data(fd, buf, buf_view, send_size, recv_size, count, + start_pos, partial_send, recd_from_proc, nprocs, + myrank, min_st_offset, fd_size, fd_start, fd_end, + others_req, m, buf_idx, &recved_bytes); + actual_recved_bytes += recved_bytes; + } + + NCI_Free(curr_offlen_ptr); + + return actual_recved_bytes; +} + +static void R_Exchange_data(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, + MPI_Count * send_size, MPI_Count * recv_size, + MPI_Count * count, MPI_Count * start_pos, + MPI_Count * partial_send, MPI_Count * recd_from_proc, int nprocs, + int myrank, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + PNCIO_Access * others_req, int iter, + MPI_Aint * buf_idx, MPI_Aint * actual_recved_bytes) +{ + int i, nprocs_recv, nprocs_send; + char **recv_buf = NULL; + size_t memLen; + MPI_Request *requests; + MPI_Datatype send_type; + MPI_Status *statuses; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double curT = MPI_Wtime(); +#endif + +/* exchange send_size info so that each process knows how much to + receive from whom and how much memory to allocate. */ + + MPI_Alltoall(send_size, 1, MPI_COUNT, recv_size, 1, MPI_COUNT, fd->comm); + + nprocs_recv = 0; + nprocs_send = 0; + memLen = 0; + for (i = 0; i < nprocs; i++) { + memLen += recv_size[i]; + if (recv_size[i]) + nprocs_recv++; + if (send_size[i]) + nprocs_send++; + } + + requests = (MPI_Request *) + NCI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request)); +/* +1 to avoid a 0-size malloc */ + +/* post recvs. if buf_view.is_contig, data can be directly recd. into + user buf at location given by buf_idx. else use recv_buf. 
*/ + + MPI_Count j = 0; // think of this as a counter of non-zero sends/recs + if (buf_view.is_contig) { + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(((char *) buf) + buf_idx[i], recv_size[i], + MPI_BYTE, i, 0, fd->comm, requests + j); +#else + MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i], + MPI_BYTE, i, 0, fd->comm, requests + j); +#endif + j++; + buf_idx[i] += recv_size[i]; + } + } + } else { + /* allocate memory for recv_buf and post receives */ + recv_buf = (char **) NCI_Malloc(nprocs * sizeof(char *)); + recv_buf[0] = (char *) NCI_Malloc(memLen); + for (i = 1; i < nprocs; i++) + recv_buf[i] = recv_buf[i - 1] + recv_size[i - 1]; + + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(recv_buf[i], recv_size[i], MPI_BYTE, i, + 0, fd->comm, requests + j); +#else + MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, + 0, fd->comm, requests + j); +#endif + j++; + } + } + } + +/* create derived datatypes and send data */ + + j = 0; + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + /* take care if the last off-len pair is a partial send */ + MPI_Offset tmp = 0; + MPI_Count k = 0; + if (partial_send[i]) { + k = start_pos[i] + count[i] - 1; + tmp = others_req[i].lens[k]; + others_req[i].lens[k] = partial_send[i]; + } +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Type_create_hindexed_c(count[i], + &(others_req[i].lens[start_pos[i]]), + &(others_req[i].mem_ptrs[start_pos[i]]), + MPI_BYTE, &send_type); +#else + MPI_Type_create_hindexed(count[i], + &(others_req[i].lens[start_pos[i]]), + &(others_req[i].mem_ptrs[start_pos[i]]), + MPI_BYTE, &send_type); +#endif + /* absolute displacement; use MPI_BOTTOM in send */ + MPI_Type_commit(&send_type); + MPI_Isend(MPI_BOTTOM, 1, send_type, i, 0, + fd->comm, requests + nprocs_recv + j); + MPI_Type_free(&send_type); + if (partial_send[i]) + others_req[i].lens[k] = tmp; + j++; + } + } +#if defined(PNETCDF_PROFILING) && 
(PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[4] += MPI_Wtime() - curT; +#endif + + + /* +1 to avoid a 0-size malloc */ + statuses = (MPI_Status *) NCI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status)); + + /* wait on the receives */ + if (nprocs_recv) { +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + curT = MPI_Wtime(); +#endif + MPI_Waitall(nprocs_recv, requests, statuses); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[3] += MPI_Wtime() - curT; +#endif + + *actual_recved_bytes = 0; + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count count_recved; + MPI_Get_count_c(&statuses[j], MPI_BYTE, &count_recved); +#else + int count_recved; + MPI_Get_count(&statuses[j], MPI_BYTE, &count_recved); +#endif + *actual_recved_bytes += count_recved; + j++; + } + } + + /* if noncontiguous, to the copies from the recv buffers */ + if (!buf_view.is_contig) + Fill_user_buffer(fd, buf, buf_view, recv_buf, recv_size, + recd_from_proc, nprocs, min_st_offset, + fd_size, fd_start, fd_end); + } + + /* wait on the sends */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + curT = MPI_Wtime(); +#endif +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nprocs_send, requests + nprocs_recv, MPI_STATUSES_IGNORE); +#else + MPI_Waitall(nprocs_send, requests + nprocs_recv, statuses + nprocs_recv); +#endif +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[3] += MPI_Wtime() - curT; +#endif + + NCI_Free(statuses); + NCI_Free(requests); + + if (!buf_view.is_contig) { + NCI_Free(recv_buf[0]); + NCI_Free(recv_buf); + } +} + +#define BUF_INCR { \ + while (buf_incr) { \ + size_in_buf = MIN(buf_incr, flat_buf_sz); \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (buf_incr > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + 
flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ +} + + +#define BUF_COPY { \ + while (size) { \ + size_in_buf = MIN(size, flat_buf_sz); \ + memcpy(((char *) buf) + user_buf_idx, \ + &(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \ + recv_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (size > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ + BUF_INCR \ +} + +static void Fill_user_buffer(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, + char **recv_buf, + MPI_Count * recv_size, + MPI_Count * recd_from_proc, int nprocs, + MPI_Offset min_st_offset, + MPI_Offset fd_size, MPI_Offset * fd_start, + MPI_Offset * fd_end) +{ + +/* this function is only called if buftype is not contig */ + + int p, flat_buf_idx; + MPI_Offset flat_buf_sz, size_in_buf, buf_incr, size; + MPI_Offset off, user_buf_idx; + MPI_Offset len, rem_len; + MPI_Count *curr_from_proc, *done_from_proc, *recv_buf_idx; + +/* curr_from_proc[p] = amount of data recd from proc. p that has already + been accounted for so far + done_from_proc[p] = amount of data already recd from proc. p and + filled into user buffer in previous iterations + user_buf_idx = current location in user buffer + recv_buf_idx[p] = current location in recv_buf of proc. 
p */ + /* combining these three related arrays into a single memory allocation + * (the "times 3" here) can help some highly noncontiguous workloads a bit */ + curr_from_proc = NCI_Malloc(nprocs * 3 * sizeof(*curr_from_proc)); + done_from_proc = curr_from_proc + nprocs; + recv_buf_idx = done_from_proc + nprocs; + + for (int i = 0; i < nprocs; i++) { + recv_buf_idx[i] = curr_from_proc[i] = 0; + done_from_proc[i] = recd_from_proc[i]; + } + + user_buf_idx = buf_view.off[0]; + flat_buf_idx = 0; + flat_buf_sz = buf_view.len[0]; + + /* flat_buf_idx = current index into flattened buftype + * flat_buf_sz = size of current contiguous component in + * flattened buf */ + + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + off = fd->flat_file.off[i]; + rem_len = fd->flat_file.len[i]; + + /* this request may span the file domains of more than one process */ + while (rem_len != 0) { + len = rem_len; + /* NOTE: len value is modified by PNCIO_Calc_aggregator() to be no + * longer than the single region that processor "p" is responsible + * for. 
+ */ + p = PNCIO_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_end); + + if (recv_buf_idx[p] < recv_size[p]) { + if (curr_from_proc[p] + len > done_from_proc[p]) { + if (done_from_proc[p] > curr_from_proc[p]) { + size = MIN(curr_from_proc[p] + len - done_from_proc[p], + recv_size[p] - recv_buf_idx[p]); + buf_incr = done_from_proc[p] - curr_from_proc[p]; + BUF_INCR + buf_incr = curr_from_proc[p] + len - done_from_proc[p]; + curr_from_proc[p] = done_from_proc[p] + size; + BUF_COPY + } else { + size = MIN(len, recv_size[p] - recv_buf_idx[p]); + buf_incr = len; + curr_from_proc[p] += size; + BUF_COPY + } + } else { + curr_from_proc[p] += len; + buf_incr = len; + BUF_INCR + } + } else { + buf_incr = len; + BUF_INCR + } + off += len; + rem_len -= len; + } + } + for (int i = 0; i < nprocs; i++) + if (recv_size[i]) + recd_from_proc[i] = curr_from_proc[i]; + + NCI_Free(curr_from_proc); +} diff --git a/src/drivers/pncio/pncio_read_str.c b/src/drivers/pncio/pncio_read_str.c new file mode 100644 index 0000000000..efbfe1a49f --- /dev/null +++ b/src/drivers/pncio/pncio_read_str.c @@ -0,0 +1,259 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+
+/* BUFFERED_READ_ABORT - leave PNCIO_GEN_ReadStrided() after a failed read:
+ * release the byte-range lock taken for atomic mode, free the staging buffer
+ * and return the error. Only valid once start_off, end_offset and readbuf
+ * have been set and the lock (if any) has been acquired. */
+#define BUFFERED_READ_ABORT(err) {                                            \
+    if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))                    \
+        PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);    \
+    NCI_Free(readbuf);                                                        \
+    return (err);                                                             \
+}
+
+/* BUFFERED_READ - copy req_len bytes at file offset req_off into
+ * buf + userbuf_off, going through the staging buffer readbuf.
+ * The staging buffer covers [readbuf_off, readbuf_off + readbuf_len). If the
+ * request starts beyond it, the buffer is refilled at req_off. While the
+ * request extends past its end, the still-needed tail is kept, the buffer is
+ * reallocated and up to max_bufsize further bytes are read behind the tail.
+ * Every read is added to total_r_len; a failed read aborts the function. */
+#define BUFFERED_READ {                                                       \
+    if (req_off >= readbuf_off + readbuf_len) {                               \
+        readbuf_off = req_off;                                                \
+        readbuf_len = MIN(max_bufsize, end_offset-readbuf_off+1);             \
+        r_len = PNCIO_ReadContig(fd, readbuf, readbuf_len, readbuf_off);      \
+        if (r_len < 0) BUFFERED_READ_ABORT(r_len)                             \
+        total_r_len += r_len;                                                 \
+    }                                                                         \
+    while (req_len > readbuf_off + readbuf_len - req_off) {                   \
+        partial_read = readbuf_off + readbuf_len - req_off;                   \
+        tmp_buf = (char *) NCI_Malloc(partial_read);                          \
+        memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read);      \
+        NCI_Free(readbuf);                                                    \
+        readbuf = (char *) NCI_Malloc(partial_read + max_bufsize);            \
+        memcpy(readbuf, tmp_buf, partial_read);                               \
+        NCI_Free(tmp_buf);                                                    \
+        readbuf_off += readbuf_len-partial_read;                              \
+        readbuf_len = partial_read +                                          \
+                      MIN(max_bufsize, end_offset-readbuf_off+1);             \
+        r_len = PNCIO_ReadContig(fd, readbuf+partial_read,                    \
+                                 readbuf_len-partial_read,                    \
+                                 readbuf_off+partial_read);                   \
+        if (r_len < 0) BUFFERED_READ_ABORT(r_len)                             \
+        total_r_len += r_len;                                                 \
+    }                                                                         \
+    memcpy((char*)buf+userbuf_off, readbuf+req_off-readbuf_off, req_len);     \
+}
+
+
+/* PNCIO_GEN_ReadStrided() - independent read of a noncontiguous request
+ * through a staging buffer of "ind_rd_buffer_size" bytes.
+ *
+ *   fd       - file handle; fd->flat_file describes the file view
+ *   buf      - user buffer
+ *   buf_view - layout of the user buffer (off[]/len[] pairs, total size)
+ *   offset   - starting position, counted in bytes of the file view
+ *
+ * Hands the request to PNCIO_GEN_ReadStrided_naive() when the romio_ds_read
+ * hint is PNCIO_HINT_DISABLE. Returns buf_view.size on success (or the byte
+ * count of the single PNCIO_ReadContig() call on the one-block shortcut), and
+ * the negative value returned by PNCIO_ReadContig() when a read fails.
+ */
+MPI_Offset PNCIO_GEN_ReadStrided(PNCIO_File *fd,
+                                 void *buf,
+                                 PNCIO_View buf_view,
+                                 MPI_Offset offset)
+{
+    char *readbuf, *tmp_buf, *value;
+    int i, j, k, st_index=0, info_flag;
+
+    MPI_Aint max_bufsize, readbuf_len;
+    MPI_Offset i_offset, new_brd_size, brd_size, size, abs_off_in_filetype=0;
+    MPI_Offset new_frd_size, frd_size=0, st_frd_size, userbuf_off, req_len;
+    MPI_Offset sum, off, req_off, disp, end_offset=0, readbuf_off, start_off;
+    MPI_Offset r_len, total_r_len=0;
+    MPI_Count num, bufsize, partial_read;
+
+// printf("%s at %d:\n",__func__,__LINE__);
+
+    if (fd->hints->romio_ds_read == PNCIO_HINT_DISABLE) {
+        /* if user has disabled data sieving on reads, use naive
+         * approach instead.
+         */
+        return PNCIO_GEN_ReadStrided_naive(fd, buf, buf_view, offset);
+    }
+
+    /* This subroutine is entered with filetype being non-contiguous only */
+    assert(fd->filetype == MPI_BYTE);
+    if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */
+
+    bufsize = buf_view.size;
+
+    /* get max_bufsize from the info object. */
+    /* NOTE(review): info_flag is not checked; if the key is missing, value is
+     * read uninitialized by atoi(). Presumably the hint is always set when
+     * the file is opened - confirm. */
+    value = (char *) NCI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+    MPI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag);
+    max_bufsize = atoi(value);
+    NCI_Free(value);
+
+    if (!buf_view.is_contig && fd->flat_file.is_contig) {
+        /* noncontiguous in memory, contiguous in file. */
+
+        off = fd->disp + offset;
+
+        start_off = off;
+        end_offset = off + bufsize - 1;
+        readbuf_off = off;
+        readbuf = (char *) NCI_Malloc(max_bufsize);
+        readbuf_len = MIN(max_bufsize, end_offset - readbuf_off + 1);
+
+        /* if atomicity is true, lock (exclusive) the region to be accessed */
+        if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+            PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+        /* prime the staging buffer; count these bytes like every other read
+         * so that the total checked at the end of the function is complete */
+        r_len = PNCIO_ReadContig(fd, readbuf, readbuf_len, readbuf_off);
+        if (r_len < 0) BUFFERED_READ_ABORT(r_len)
+        total_r_len += r_len;
+
+        for (i = 0; i < buf_view.count; i++) {
+            userbuf_off = buf_view.off[i];
+            req_off = off;
+            req_len = buf_view.len[i];
+            BUFFERED_READ
+            off += buf_view.len[i];
+        }
+
+        if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+            PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+        NCI_Free(readbuf);
+    }
+
+    else {      /* noncontiguous in file */
+        MPI_Offset size_in_filetype = offset;
+
+        disp = fd->disp;
+
+        /* locate the file-view block that contains position `offset` */
+        sum = 0;
+        for (i = 0; i < fd->flat_file.count; i++) {
+            sum += fd->flat_file.len[i];
+            if (sum > size_in_filetype) {
+                st_index = i;
+                frd_size = sum - size_in_filetype;
+                abs_off_in_filetype = fd->flat_file.off[i] +
+                    size_in_filetype - (sum - fd->flat_file.len[i]);
+                break;
+            }
+        }
+
+        /* abs. offset in bytes in the file */
+        offset = disp + abs_off_in_filetype;
+
+        start_off = offset;
+
+        /* Wei-keng Liao: read request is within a single flat_file contig
+         * block e.g. with subarray types that actually describe the whole
+         * array */
+        if (buf_view.is_contig && bufsize <= frd_size) {
+            /* a count of bytes can overflow. operate on original type instead */
+            r_len = PNCIO_ReadContig(fd, buf, buf_view.size, offset);
+
+            assert(buf_view.size == r_len);
+            return r_len;
+        }
+
+        /* Calculate end_offset, the last byte-offset that will be accessed.
+         * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */
+
+        st_frd_size = frd_size;
+        i_offset = 0;
+        j = st_index;
+        off = offset;
+        frd_size = MIN(st_frd_size, bufsize);
+        while (i_offset < bufsize) {
+            i_offset += frd_size;
+            end_offset = off + frd_size - 1;
+
+            if (i_offset >= bufsize) break;
+            j++;
+            off = disp + fd->flat_file.off[j];
+            frd_size = MIN(fd->flat_file.len[j], bufsize - i_offset);
+        }
+
+        /* if atomicity is true, lock (exclusive) the region to be accessed */
+        if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+            PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+        /* empty staging buffer: the first BUFFERED_READ fills it */
+        readbuf_off = 0;
+        readbuf_len = 0;
+        readbuf = (char *) NCI_Malloc(max_bufsize);
+
+        if (buf_view.is_contig && !fd->flat_file.is_contig) {
+            /* contiguous in memory, noncontiguous in file should be the most
+             * common case.
+             */
+            i_offset = 0;
+            j = st_index;
+            off = offset;
+            frd_size = MIN(st_frd_size, bufsize);
+            while (i_offset < bufsize) {
+                if (frd_size) {
+                    req_off = off;
+                    req_len = frd_size;
+                    userbuf_off = i_offset;
+                    BUFFERED_READ
+                }
+
+                i_offset += frd_size;
+                if (i_offset >= bufsize) break;
+
+                if (off + frd_size < disp + fd->flat_file.off[j] +
+                                     fd->flat_file.len[j])
+                    off += frd_size; /* off is incremented by frd_size */
+                else {
+                    j++;
+                    assert(j < fd->flat_file.count);
+                    off = disp + fd->flat_file.off[j];
+                    frd_size = MIN(fd->flat_file.len[j],
+                                   bufsize - i_offset);
+                }
+            }
+        } else {
+            /* noncontiguous in memory as well as in file */
+            k = num = 0;
+            i_offset = buf_view.off[0];
+            j = st_index;
+            off = offset;
+            frd_size = st_frd_size;
+            brd_size = buf_view.len[0];
+
+            while (num < bufsize) {
+                /* largest piece contiguous in both file and memory */
+                size = MIN(frd_size, brd_size);
+                if (size) {
+                    req_off = off;
+                    req_len = size;
+                    userbuf_off = i_offset;
+                    BUFFERED_READ
+                }
+
+                num += size;
+                if (num >= bufsize) break;
+
+                new_frd_size = frd_size;
+                new_brd_size = brd_size;
+
+                if (size == frd_size) {
+                    /* reached end of contiguous block in file */
+                    j++;
+                    assert(j < fd->flat_file.count);
+                    off = disp + fd->flat_file.off[j];
+                    new_frd_size = fd->flat_file.len[j];
+                    if (size != brd_size) {
+                        i_offset += size;
+                        new_brd_size -= size;
+                    }
+                }
+
+                if (size == brd_size) {
+                    /* reached end of contiguous block in memory */
+                    k++;
+                    assert(k < buf_view.count);
+                    i_offset = buf_view.off[k];
+                    new_brd_size = buf_view.len[k];
+                    if (size != frd_size) {
+                        off += size;
+                        new_frd_size -= size;
+                    }
+                }
+                frd_size = new_frd_size;
+                brd_size = new_brd_size;
+            }
+        }
+
+        if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+            PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+        NCI_Free(readbuf);      /* malloced in the buffered_read macro */
+    }
+
+    assert(total_r_len >= buf_view.size);
+
+    return buf_view.size;
+}
diff --git a/src/drivers/pncio/pncio_read_str_naive.c b/src/drivers/pncio/pncio_read_str_naive.c
new file mode
100644
index 0000000000..fa003f4037
--- /dev/null
+++ b/src/drivers/pncio/pncio_read_str_naive.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+
+/* PNCIO_GEN_ReadStrided_naive() - independent read of a noncontiguous request
+ * without a staging buffer: one PNCIO_ReadContig() call per piece that is
+ * contiguous in both the file view and the user buffer.
+ *
+ *   fd       - file handle; fd->flat_file describes the file view
+ *   buf      - user buffer
+ *   buf_view - layout of the user buffer (off[]/len[] pairs, total size)
+ *   offset   - starting position, counted in bytes of the file view
+ *
+ * Returns the sum of the byte counts returned by PNCIO_ReadContig(), 0 when
+ * the file view is empty, or the negative value returned by the first failing
+ * PNCIO_ReadContig() call. A byte-range lock taken for atomic mode is
+ * released on the error path as well.
+ */
+MPI_Offset PNCIO_GEN_ReadStrided_naive(PNCIO_File *fd,
+                                       void *buf,
+                                       PNCIO_View buf_view,
+                                       MPI_Offset offset)
+{
+    int b_index;
+    MPI_Offset size, brd_size, frd_size=0, req_len, sum, off, req_off, disp;
+    MPI_Offset end_offset=0, start_off, abs_off_in_filetype=0, userbuf_off;
+    MPI_Offset r_len, total_r_len=0;
+    MPI_Count bufsize;
+
+// printf("%s at %d:\n",__func__,__LINE__);
+
+    if (fd->flat_file.size == 0)
+        return 0;
+
+    bufsize = buf_view.size;
+
+    /* contiguous in buftype and filetype is handled elsewhere */
+
+    if (!buf_view.is_contig && fd->flat_file.is_contig) {
+        /* noncontiguous in memory, contiguous in file. */
+
+        off = fd->disp + offset;
+
+        start_off = off;
+        end_offset = off + bufsize - 1;
+
+        /* if atomicity is true, lock (exclusive) the region to be accessed */
+        if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+            PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+        /* for each region in the buffer, grab the data and put it in place */
+        for (b_index = 0; b_index < buf_view.count; b_index++) {
+            userbuf_off = buf_view.off[b_index];
+            req_off = off;
+            req_len = buf_view.len[b_index];
+
+            r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off,
+                                     req_len, req_off);
+            if (r_len < 0) {
+                /* report the error, but fall through to the unlock below */
+                total_r_len = r_len;
+                break;
+            }
+            total_r_len += r_len;
+
+            /* off is (potentially) used to save the final offset later */
+            off += buf_view.len[b_index];
+        }
+
+        if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+            PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+    }
+    else {      /* noncontiguous in file */
+        MPI_Offset size_in_filetype = offset;
+
+        int f_index, st_index = 0;
+        MPI_Offset st_frd_size;
+
+        /* First we're going to calculate a set of values for use in all
+         * the noncontiguous in file cases:
+         * start_off - starting byte position of data in file
+         * end_offset - last byte offset to be accessed in the file
+         * st_index - index of block in first filetype that we will be
+         *            starting in (?)
+         * st_frd_size - size of the data in the first filetype block
+         *               that we will read (accounts for being part-way
+         *               into reading this block of the filetype
+         *
+         */
+
+        disp = fd->disp;
+
+        sum = 0;
+        for (f_index = 0; f_index < fd->flat_file.count; f_index++) {
+            sum += fd->flat_file.len[f_index];
+            if (sum > size_in_filetype) {
+                st_index = f_index;
+                frd_size = sum - size_in_filetype;
+                abs_off_in_filetype = fd->flat_file.off[f_index] +
+                    size_in_filetype - (sum - fd->flat_file.len[f_index]);
+                break;
+            }
+        }
+
+        /* abs. offset in bytes in the file */
+        start_off = disp + abs_off_in_filetype;
+
+        st_frd_size = frd_size;
+
+        /* start_off, st_index, and st_frd_size are
+         * all calculated at this point
+         */
+
+        /* Calculate end_offset, the last byte-offset that will be accessed.
+         * e.g., if start_off=0 and 100 bytes to be read, end_offset=99
+         */
+        f_index = st_index;
+        userbuf_off = frd_size = MIN(st_frd_size, bufsize);
+        end_offset = start_off + frd_size - 1;
+        while (userbuf_off < bufsize) {
+            f_index++;
+            assert(f_index < fd->flat_file.count);
+
+            off = disp + fd->flat_file.off[f_index];
+            frd_size = MIN(fd->flat_file.len[f_index],
+                           bufsize - userbuf_off);
+            userbuf_off += frd_size;
+            end_offset = off + frd_size - 1;
+        }
+
+        /* End of calculations. At this point the following values have
+         * been calculated and are ready for use:
+         * - start_off
+         * - end_offset
+         * - st_index
+         * - st_frd_size
+         */
+
+        /* if atomicity is true, lock (exclusive) the region to be accessed */
+        if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+            PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+        if (buf_view.is_contig && !fd->flat_file.is_contig) {
+            /* contiguous in memory, noncontiguous in file. should be the
+             * most common case.
+             */
+
+            userbuf_off = 0;
+            f_index = st_index;
+            off = start_off;
+            frd_size = MIN(st_frd_size, bufsize);
+
+            /* while there is still space in the buffer, read more data */
+            while (userbuf_off < bufsize) {
+                if (frd_size) {
+                    /* TYPE_UB and TYPE_LB can result in
+                     * frd_size = 0. save system call in such cases */
+                    req_off = off;
+                    req_len = frd_size;
+
+                    r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off,
+                                             req_len, req_off);
+                    if (r_len < 0) {
+                        /* leave the loop so the lock below is released */
+                        total_r_len = r_len;
+                        break;
+                    }
+                    total_r_len += r_len;
+                }
+                userbuf_off += frd_size;
+                if (userbuf_off >= bufsize) break;
+
+                if (off + frd_size < disp + fd->flat_file.off[f_index] +
+                                     fd->flat_file.len[f_index]) {
+                    /* important that this value be correct, as it is
+                     * used to set the offset in the fd near the end of
+                     * this function.
+                     */
+                    off += frd_size;
+                }
+                /* did not reach end of contiguous block in filetype.
+                 * no more I/O needed. off is incremented by frd_size.
+                 */
+                else {
+                    f_index++;
+                    assert(f_index < fd->flat_file.count);
+                    off = disp + fd->flat_file.off[f_index];
+                    frd_size = MIN(fd->flat_file.len[f_index],
+                                   bufsize - userbuf_off);
+                }
+            }
+        } else {
+            MPI_Offset i_offset, tmp_bufsize = 0;
+            /* noncontiguous in memory as well as in file */
+
+            b_index = 0;
+            i_offset = buf_view.off[0];
+            f_index = st_index;
+            off = start_off;
+            frd_size = st_frd_size;
+            brd_size = buf_view.len[0];
+
+            /* while we haven't read size * count bytes, keep going */
+            while (tmp_bufsize < bufsize) {
+                MPI_Offset new_brd_size = brd_size, new_frd_size = frd_size;
+
+                size = MIN(frd_size, brd_size);
+                /* keep max of a single read amount <= INT_MAX */
+                size = MIN(size, INT_MAX);
+
+                if (size) {
+                    req_off = off;
+                    req_len = size;
+                    userbuf_off = i_offset;
+
+                    r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off,
+                                             req_len, req_off);
+                    if (r_len < 0) {
+                        /* leave the loop so the lock below is released */
+                        total_r_len = r_len;
+                        break;
+                    }
+                    total_r_len += r_len;
+                }
+
+                tmp_bufsize += size;
+                if (tmp_bufsize >= bufsize) break;
+
+                if (size == frd_size) {
+                    /* reached end of contiguous block in file */
+                    f_index++;
+                    assert(f_index < fd->flat_file.count);
+                    off = disp + fd->flat_file.off[f_index];
+
+                    new_frd_size = fd->flat_file.len[f_index];
+                    if (size != brd_size) {
+                        i_offset += size;
+                        new_brd_size -= size;
+                    }
+                }
+
+                if (size == brd_size) {
+                    /* reached end of contiguous block in memory */
+                    b_index++;
+                    assert(b_index < buf_view.count);
+                    i_offset = buf_view.off[b_index];
+                    new_brd_size = buf_view.len[b_index];
+                    if (size != frd_size) {
+                        off += size;
+                        new_frd_size -= size;
+                    }
+                }
+                frd_size = new_frd_size;
+                brd_size = new_brd_size;
+            }
+        }
+
+        /* unlock the file region if we locked it */
+        if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+            PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+    }   /* end of (else noncontiguous in file) */
+
+    /* byte total on success, or the negative error recorded above */
+    return total_r_len;
+}
diff --git a/src/drivers/pncio/pncio_set_size.c b/src/drivers/pncio/pncio_set_size.c
new file mode 100644
index
0000000000..77490f4811
--- /dev/null
+++ b/src/drivers/pncio/pncio_set_size.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+#include /* strdup() */
+#include
+#include
+#ifdef HAVE_UNISTD_H
+#include /* ftruncate(), lseek() */
+#endif
+
+#include
+
+#include
+#include
+#include "pncio.h"
+
+/*----< PNCIO_File_set_size() >-----------------------------------------------*/
+/* Set the size of the file to `size` bytes.
+ * Only rank 0 of fd->comm calls ftruncate(); its result is then broadcast,
+ * so the call must be made by every process of fd->comm and all of them
+ * return the same value: NC_NOERR, or the code produced by
+ * ncmpii_error_posix2nc("ftruncate") when ftruncate() fails.
+ */
+int PNCIO_File_set_size(PNCIO_File *fd,
+                        MPI_Offset size)
+{
+    int err = NC_NOERR, rank;
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+    if (rank == 0) {
+        err = ftruncate(fd->fd_sys, (off_t) size);
+        if (err != 0)
+            err = ncmpii_error_posix2nc("ftruncate");
+    }
+
+    /* share rank 0's outcome with everyone */
+    MPI_Bcast(&err, 1, MPI_INT, 0, fd->comm);
+
+    return err;
+}
+
+/*----< PNCIO_File_get_size() >-----------------------------------------------*/
+/* Store the current file size, in bytes, in *size.
+ * Only rank 0 of fd->comm queries the file, by seeking to its end with
+ * lseek(SEEK_END); the error code and the size are then broadcast together,
+ * so the call must be made by every process of fd->comm. On failure *size is
+ * -1 and the return value is the code from ncmpii_error_posix2nc("lseek").
+ */
+int PNCIO_File_get_size(PNCIO_File *fd,
+                        MPI_Offset *size)
+{
+    int err = NC_NOERR, rank;
+    MPI_Offset msg[2];  /* [0] = error code, [1] = file size */
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+    if (rank == 0) {
+        *size = lseek(fd->fd_sys, 0, SEEK_END);
+        if (*size == -1)
+            err = ncmpii_error_posix2nc("lseek");
+        msg[0] = err;
+        msg[1] = *size;
+    }
+
+    /* msg is filled on rank 0 only; the broadcast overwrites it elsewhere */
+    MPI_Bcast(msg, 2, MPI_OFFSET, 0, fd->comm);
+    err = (int)msg[0];
+    *size = msg[1];
+
+    return err;
+}
+
diff --git a/src/drivers/pncio/pncio_set_view.c b/src/drivers/pncio/pncio_set_view.c
new file mode 100644
index 0000000000..ddf41e968d
--- /dev/null
+++ b/src/drivers/pncio/pncio_set_view.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+#include /* strdup() */
+#include
+#include
+
+#include
+
+#include
+#include
+#include "pncio.h"
+
+/*----< PNCIO_File_set_view() >-----------------------------------------------*/
+/* For PnetCDF, this subroutine is an independent call, because PnetCDF only
+ * use the followings.
+ *   Argument etype is always MPI_BYTE.
+ *   Argument datarep is always "native".
+ *   Argument info is always MPI_INFO_NULL.
+ *
+ * Installs a file view made of npairs (offset, length) pairs into
+ * fd->flat_file. The offsets[] and lengths[] arrays are not copied: the view
+ * keeps pointing at the caller's arrays, which therefore must outlive it.
+ * filetype must be MPI_BYTE and disp must be 0 (both are asserted).
+ * Always returns NC_NOERR.
+ */
+int PNCIO_File_set_view(PNCIO_File    *fd,
+                        MPI_Offset     disp,
+                        MPI_Datatype   filetype,
+                        MPI_Aint       npairs,
+#ifdef HAVE_MPI_LARGE_COUNT
+                        MPI_Count     *offsets,
+                        MPI_Count     *lengths
+#else
+                        MPI_Offset    *offsets,
+                        int           *lengths
+#endif
+)
+{
+    MPI_Aint i;
+
+    assert(filetype == MPI_BYTE);
+    assert(disp == 0);
+    fd->filetype = filetype;
+    fd->disp = 0;
+
+    fd->flat_file.count = npairs;
+    fd->flat_file.off   = offsets;
+    fd->flat_file.len   = lengths;
+    fd->flat_file.idx   = 0;
+    /* bytes left in the first pair; 0 for an empty view */
+    fd->flat_file.rem   = (npairs > 0) ? lengths[0] : 0;
+
+    /* Size of fileview must be calculated here, as PnetCDF may coalesce the
+     * offset-length pairs in order to make offsets sorted in a monotonically
+     * non-decreasing order.
+     */
+    fd->flat_file.size = 0;
+    for (i=0; i<npairs; i++) fd->flat_file.size += lengths[i];
+
+    /* is_contig is redundant to (count <= 1), but convenient */
+    fd->flat_file.is_contig = (npairs <= 1);
+
+    return NC_NOERR;
+}
+
diff --git a/src/drivers/pncio/pncio_sync.c b/src/drivers/pncio/pncio_sync.c
new file mode 100644
index 0000000000..49dc31bffc
--- /dev/null
+++ b/src/drivers/pncio/pncio_sync.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include /* strdup() */ +#include +#include +#ifdef HAVE_UNISTD_H +#include /* fsync(), unlink(), ftruncate(), lseek() */ +#endif + +#include + +#include +#include +#include "pncio.h" + +/*----< PNCIO_File_sync() >---------------------------------------------------*/ +int PNCIO_File_sync(PNCIO_File *fd) +{ + int err = NC_NOERR; + + if (fd->is_open > 0) { + err = fsync(fd->fd_sys); + if (err != 0) + err = ncmpii_error_posix2nc("fsync"); + } + + return err; +} + diff --git a/src/drivers/pncio/pncio_utils.c b/src/drivers/pncio/pncio_utils.c new file mode 100644 index 0000000000..c4c1629e14 --- /dev/null +++ b/src/drivers/pncio/pncio_utils.c @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include /* va_start(), va_end() */ + +#include + +/* some systems do not have pread/pwrite, or requrie XOPEN_SOURCE set higher + * than we would like. 
see #1973 */ +#if (HAVE_DECL_PWRITE == 0) + +#include +#include + +ssize_t pread(int fd, void *buf, size_t count, off_t offset); +ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset); + +ssize_t pread(int fd, void *buf, size_t count, off_t offset) +{ + off_t lseek_ret; + off_t old_offset; + ssize_t read_ret; + + old_offset = lseek(fd, 0, SEEK_CUR); + lseek_ret = lseek(fd, offset, SEEK_SET); + if (lseek_ret == -1) + return lseek_ret; + read_ret = read(fd, buf, count); + if (read_ret < 0) + return read_ret; + /* man page says "file offset is not changed" */ + if ((lseek_ret = lseek(fd, old_offset, SEEK_SET)) < 0) + return lseek_ret; + + return read_ret; +} + +ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) +{ + off_t lseek_ret; + off_t old_offset; + ssize_t write_ret; + + old_offset = lseek(fd, 0, SEEK_CUR); + lseek_ret = lseek(fd, offset, SEEK_SET); + if (lseek_ret == -1) + return lseek_ret; + write_ret = write(fd, buf, count); + if (write_ret < 0) + return write_ret; + /* man page says "file offset is not changed" */ + if ((lseek_ret = lseek(fd, old_offset, SEEK_SET)) < 0) + return lseek_ret; + + return write_ret; +} +#endif + +void PNCIO_Heap_merge(PNCIO_Access * others_req, MPI_Count * count, + MPI_Offset * srt_off, MPI_Count * srt_len, MPI_Count * start_pos, + int nprocs, int nprocs_recv, MPI_Count total_elements) +{ + typedef struct { + MPI_Offset *off_list; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_list; +#else + int *len_list; +#endif + MPI_Count nelem; + } heap_struct; + + heap_struct *a, tmp; + int i, j, heapsize, l, r, k, smallest; + + a = (heap_struct *) NCI_Malloc((nprocs_recv + 1) * sizeof(heap_struct)); + + j = 0; + for (i = 0; i < nprocs; i++) + if (count[i]) { + a[j].off_list = &(others_req[i].offsets[start_pos[i]]); + a[j].len_list = &(others_req[i].lens[start_pos[i]]); + a[j].nelem = count[i]; + j++; + } + + /* build a heap out of the first element from each list, with + * the smallest element of the heap at 
the root */ + + heapsize = nprocs_recv; + for (i = heapsize / 2 - 1; i >= 0; i--) { + /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 + * modified for a heap with smallest element at root. I have + * removed the recursion so that there are no function calls. + * Function calls are too expensive. */ + k = i; + for (;;) { + l = 2 * (k + 1) - 1; + r = 2 * (k + 1); + + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + tmp.off_list = a[k].off_list; + tmp.len_list = a[k].len_list; + tmp.nelem = a[k].nelem; + + a[k].off_list = a[smallest].off_list; + a[k].len_list = a[smallest].len_list; + a[k].nelem = a[smallest].nelem; + + a[smallest].off_list = tmp.off_list; + a[smallest].len_list = tmp.len_list; + a[smallest].nelem = tmp.nelem; + + k = smallest; + } else + break; + } + } + + for (i = 0; i < total_elements; i++) { + /* extract smallest element from heap, i.e. 
the root */ + srt_off[i] = *(a[0].off_list); + srt_len[i] = *(a[0].len_list); + (a[0].nelem)--; + + if (!a[0].nelem) { + a[0].off_list = a[heapsize - 1].off_list; + a[0].len_list = a[heapsize - 1].len_list; + a[0].nelem = a[heapsize - 1].nelem; + heapsize--; + } else { + (a[0].off_list)++; + (a[0].len_list)++; + } + + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + l = 2 * (k + 1) - 1; + r = 2 * (k + 1); + + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + tmp.off_list = a[k].off_list; + tmp.len_list = a[k].len_list; + tmp.nelem = a[k].nelem; + + a[k].off_list = a[smallest].off_list; + a[k].len_list = a[smallest].len_list; + a[k].nelem = a[smallest].nelem; + + a[smallest].off_list = tmp.off_list; + a[smallest].len_list = tmp.len_list; + a[smallest].nelem = tmp.nelem; + + k = smallest; + } else + break; + } + } + NCI_Free(a); +} + diff --git a/src/drivers/pncio/pncio_write.c b/src/drivers/pncio/pncio_write.c new file mode 100644 index 0000000000..debd07e1d9 --- /dev/null +++ b/src/drivers/pncio/pncio_write.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include /* pwrite() */ + +#include + +#include "pncio.h" + +#ifdef WKL_DEBUG +int first_ost_id; +#endif + +/*----< PNCIO_WriteContig() >-------------------------------------------------*/ +MPI_Offset PNCIO_WriteContig(PNCIO_File *fd, + const void *buf, + MPI_Offset w_size, + MPI_Offset offset) +{ + ssize_t err = 0; + size_t w_count; + MPI_Offset bytes_xfered = 0; + char *p; + + if (w_size == 0) return NC_NOERR; + +// printf("%s at %d: pwrite offset=%lld w_size=%lld\n",__func__,__LINE__,offset,w_size); +#ifdef WKL_DEBUG +int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank); + +MPI_Offset ost_id = (offset / fd->hints->striping_unit) % fd->hints->striping_factor; + if (first_ost_id == -1) { + first_ost_id = ost_id; + // printf("%2d %s file %s First pwrite offset=%lld OST %d\n",rank,__func__,fd->filename,offset,first_ost_id); + } + else if (ost_id != first_ost_id) + printf("%2d Error: %s pwrite offset=%lld w_size=%lld ost_id=%lld not same 1st ost %d\n",rank,__func__,offset,w_size,ost_id,first_ost_id); + +printf("%s line %d: disp=%lld offset=%lld count=%ld bufType_size=%d w_size=%lld\n",__func__,__LINE__,fd->disp,offset,count,bufType_size,w_size); + + printf("%2d %s line %d pwrite offset=%lld w_size=%lld\n",rank,__func__,__LINE__,offset,w_size); +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double timing = MPI_Wtime(); +#endif + p = (char *) buf; + while (bytes_xfered < w_size) { + w_count = w_size - bytes_xfered; + err = pwrite(fd->fd_sys, p, w_count, offset + bytes_xfered); + if (err == -1) + goto ioerr; + if (err == 0) + break; + bytes_xfered += err; + p += err; + } +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[2] += MPI_Wtime() - timing; +#endif + +ioerr: + if (err == -1) + bytes_xfered = ncmpii_error_posix2nc("pwrite"); + + return bytes_xfered; +} + +/*----< file_write() 
>-------------------------------------------------------*/
+/* This is an independent call.
+ * Dispatches one write request: a contiguous buffer onto a contiguous
+ * fileview goes straight to PNCIO_WriteContig(); anything else goes to the
+ * strided writer of the underlying file system. A negative return value is
+ * a NetCDF error code.
+ */
+static
+MPI_Offset file_write(PNCIO_File *fd,
+                      MPI_Offset  offset,
+                      const void *buf,
+                      PNCIO_View  buf_view)
+{
+    if (buf_view.size == 0) /* zero-sized request */
+        return NC_NOERR;
+
+    assert(fd->filetype == MPI_BYTE);
+
+    if (buf_view.is_contig && fd->flat_file.is_contig) {
+        MPI_Offset start = offset;
+
+        /* a single-pair fileview shifts the starting position */
+        if (fd->flat_file.count > 0)
+            start += fd->flat_file.off[0];
+
+        return PNCIO_WriteContig(fd, buf, buf_view.size, start);
+    }
+
+    if (fd->file_system == PNCIO_LUSTRE)
+        return PNCIO_LUSTRE_WriteStrided(fd, buf, buf_view, offset);
+
+    if (fd->file_system == PNCIO_UFS)
+        return PNCIO_GEN_WriteStrided(fd, buf, buf_view, offset);
+
+    return NC_EFSTYPE;
+}
+
+/*----< PNCIO_File_write_at() >-----------------------------------------------*/
+/* This is an independent call.
+ * offset is a position in the file relative to the current view, expressed as
+ * a count of etypes.
+ * Returns NC_NOERR for a zero-sized request, NC_ENEGATIVECNT for a negative
+ * size, NC_EPERM when the file was opened read-only, and otherwise whatever
+ * file_write() returns.
+ */
+MPI_Offset PNCIO_File_write_at(PNCIO_File *fh,
+                               MPI_Offset  offset,
+                               const void *buf,
+                               PNCIO_View  buf_view)
+{
+    assert(fh != NULL);
+
+    /* reject or short-circuit degenerate sizes first */
+    if (buf_view.size <= 0)
+        return (buf_view.size == 0) ? NC_NOERR : NC_ENEGATIVECNT;
+
+    if (fh->access_mode & MPI_MODE_RDONLY)
+        return NC_EPERM;
+
+    return file_write(fh, offset, buf, buf_view);
+}
+
+/*----< PNCIO_File_write_at_all() >-------------------------------------------*/
+/* This is a collective call.
+ * offset is a position in the file relative to the current view, expressed as
+ * a count of etypes.
+ */ +MPI_Offset PNCIO_File_write_at_all(PNCIO_File *fh, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR; + MPI_Offset w_len; + + assert(fh != NULL); + + if (buf_view.size < 0) err = NC_ENEGATIVECNT; + + if (fh->access_mode & MPI_MODE_RDONLY && err == NC_NOERR) + err = NC_EPERM; + + if (fh->file_system == PNCIO_LUSTRE) + w_len = PNCIO_LUSTRE_WriteStridedColl(fh, buf, buf_view, offset); + else if (fh->file_system == PNCIO_UFS) + w_len = PNCIO_GEN_WriteStridedColl(fh, buf, buf_view, offset); + else + return NC_EFSTYPE; + + return (err == NC_NOERR) ? w_len : err; +} + + diff --git a/src/drivers/pncio/pncio_write_coll.c b/src/drivers/pncio/pncio_write_coll.c new file mode 100644 index 0000000000..a9c4796e87 --- /dev/null +++ b/src/drivers/pncio/pncio_write_coll.c @@ -0,0 +1,931 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include "pncio.h" + +/* prototypes of functions used for collective writes only. 
*/ +static MPI_Offset Exch_and_write(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, int nprocs, int myrank, + PNCIO_Access *others_req, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Aint * buf_idx); + +static MPI_Offset W_Exchange_data(PNCIO_File *fd, void *buf, char *write_buf, + PNCIO_View buf_view, + MPI_Count * send_size, MPI_Count * recv_size, + MPI_Offset off, MPI_Count size, /* 10 */ + MPI_Count * count, MPI_Count * start_pos, + MPI_Count * partial_recv, MPI_Count * + sent_to_proc, int nprocs, + int myrank, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + PNCIO_Access * others_req, + MPI_Count *send_buf_idx, MPI_Count *curr_to_proc, + MPI_Count *done_to_proc, int *hole, int iter, + MPI_Aint * buf_idx); + +static void Fill_send_buffer(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, char **send_buf, + MPI_Count *send_size, MPI_Request *requests, + MPI_Count *sent_to_proc, int nprocs, int myrank, + MPI_Offset min_st_offset, + MPI_Offset fd_size, MPI_Offset *fd_start, + MPI_Offset *fd_end, MPI_Count *send_buf_idx, + MPI_Count *curr_to_proc, MPI_Count *done_to_proc, int iter); + +MPI_Offset PNCIO_GEN_WriteStridedColl(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) /* relative to fileview */ +{ + /* Uses a generalized version of the extended two-phase method described in + * "An Extended Two-Phase Method for Accessing Sections of Out-of-Core + * Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, + * (5)4:301--317, Winter 1996. + * http://www.mcs.anl.gov/home/thakur/ext2ph.ps + */ + + PNCIO_Access *my_req; + /* array of nprocs access structures, one for each other process in + * whose file domain this process's request lies */ + + PNCIO_Access *others_req; + /* array of nprocs access structures, one for each other process + * whose request lies in this process's file domain. 
*/ + + int i, nprocs, nprocs_for_coll, myrank, interleave_count=0; + MPI_Aint *buf_idx = NULL; + MPI_Count *count_my_req_per_proc, count_my_req_procs; + MPI_Count *count_others_req_per_proc, count_others_req_procs; + MPI_Offset start_offset, end_offset, fd_size, min_st_offset; + MPI_Offset *st_offsets=NULL, *fd_start=NULL; + MPI_Offset *fd_end=NULL, *end_offsets=NULL, w_len=0; + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset one_len = (MPI_Offset)buf_view.size; +#else + int one_len = (int)buf_view.size; +#endif + +// printf("%s at %d: offset=%lld buf_view.size=%lld flat_file.count %lld size %lld is_contig %d\n",__func__,__LINE__, offset,buf_view.size,fd->flat_file.count, fd->flat_file.size, fd->flat_file.is_contig); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +double curT = MPI_Wtime(); +#endif + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + /* PnetCDF never reuses a fileview across two or more PNCIO calls. As this + * subroutine may modify the contents of fd->flat_file, we save its + * contents and restore it before leaving this sibroutine. + */ + PNCIO_View saved_flat_file = fd->flat_file; + + if (fd->flat_file.count == 0) { /* whole file is visible */ + /* set flat_file as a single contiguous offset-length pair */ + fd->flat_file.off = &offset; + fd->flat_file.len = &one_len; + fd->flat_file.size = one_len; + fd->flat_file.count = 1; + fd->flat_file.is_contig = 1; + start_offset = offset; + end_offset = offset + buf_view.size - 1; + } + else { + /* When flat_file is not contiguous, PnetCDF always calls this + * subroutine with offset == 0. 
+ */ + assert(offset == 0); + } + + /* the number of processes that actually perform I/O, nprocs_for_coll, is + * stored in the hints off the PNCIO_File structure + */ + nprocs_for_coll = fd->hints->cb_nodes; + + /* only check for interleaving if romio_cb_write isn't disabled */ + if (fd->hints->romio_cb_write != PNCIO_HINT_DISABLE) { + /* For this process's request, calculate the file start and end + * offsets. Note: end_offset points to the last byte-offset that will + * be accessed, e.g., if start_offset=0 and 100 bytes to be read, + * end_offset=99. + * + * Note flat_file.off[] is always relative to beginning of file. + */ + start_offset = fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + + /* Each process communicates its start and end offsets to other + * processes. The result is an array each of start and end offsets + * stored in order of process rank. + */ + + st_offsets = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset)); + end_offsets = st_offsets + nprocs; + + MPI_Allgather(&start_offset, 1, MPI_OFFSET, st_offsets, 1, MPI_OFFSET, + fd->comm); + MPI_Allgather(&end_offset, 1, MPI_OFFSET, end_offsets, 1, MPI_OFFSET, + fd->comm); + + /* Are the accesses of different processes interleaved? Below is a + * rudimentary check for interleaving, but should suffice for the + * moment. 
+ */ + for (i = 1; i < nprocs; i++) + if (st_offsets[i] < end_offsets[i - 1] && + st_offsets[i] <= end_offsets[i]) + interleave_count++; + } + + if (fd->hints->romio_cb_write == PNCIO_HINT_DISABLE || + (!interleave_count && (fd->hints->romio_cb_write == PNCIO_HINT_AUTO))) { + + /* use independent accesses */ + if (fd->hints->romio_cb_write != PNCIO_HINT_DISABLE) + NCI_Free(st_offsets); + + /* restore flattend file view before leaving this sibroutine */ + fd->flat_file = saved_flat_file; + + if (buf_view.size == 0) /* zero_sized request */ + return 0; + + if (buf_view.is_contig && fd->flat_file.is_contig) { + /* When fd->flat_file.is_contig, it is still possible + * fd->flat_file.count > 0 and when this happens + * fd->flat_file.count should be 1, which comes from PnetCDF wait + * when the number of nonblocking requests is 1. + */ + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; + w_len = PNCIO_WriteContig(fd, buf, buf_view.size, offset); + } + else + w_len = PNCIO_GEN_WriteStrided(fd, buf, buf_view, offset); + + return w_len; + } + + /* Divide the I/O workload among "nprocs_for_coll" processes. This is done + * by (logically) dividing the file into file domains (FDs); each process + * may directly access only its own file domain. + */ + PNCIO_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, + &min_st_offset, &fd_start, &fd_end, &fd_size, + fd->hints->striping_unit); + + /* calculate what portions of the access requests of this process are + * located in what file domains + */ + PNCIO_Calc_my_req(fd, min_st_offset, fd_end, fd_size, nprocs, + &count_my_req_procs, &count_my_req_per_proc, &my_req, + &buf_idx); + +/* based on everyone's my_req, calculate what requests of other + processes lie in this process's file domain. 
+ count_others_req_procs = number of processes whose requests lie in + this process's file domain (including this process itself) + count_others_req_per_proc[i] indicates how many separate contiguous + requests of proc. i lie in this process's file domain. */ + + PNCIO_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, + my_req, nprocs, myrank, &count_others_req_procs, + &count_others_req_per_proc, &others_req); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[1] += MPI_Wtime() - curT; +#endif + +/* exchange data and write in sizes of no more than coll_bufsize. */ + /* Cast away const'ness for the below function */ + w_len = Exch_and_write(fd, (char *) buf, buf_view, nprocs, myrank, + others_req, min_st_offset, fd_size, fd_start, + fd_end, buf_idx); + + /* If this collective write is followed by an independent write, + * it's possible to have those subsequent writes on other processes + * race ahead and sneak in before the read-modify-write completes. + * We carry out a collective communication at the end here so no one + * can start independent i/o before collective I/O completes. 
+ * + * need to do some gymnastics with the error codes so that if something + * went wrong, all processes report error, but if a process has a more + * specific error code, we can still have that process report the + * additional information */ + + /* optimization: if only one process performing i/o, we can perform + * a less-expensive Bcast + */ + if (fd->hints->cb_nodes == 1) + MPI_Bcast(&w_len, 1, MPI_OFFSET, fd->hints->ranklist[0], fd->comm); + else + MPI_Allreduce(MPI_IN_PLACE, &w_len, 1, MPI_OFFSET, MPI_MIN, fd->comm); + + /* free all memory allocated for collective I/O */ + PNCIO_Free_my_req(count_my_req_per_proc, my_req, buf_idx); + PNCIO_Free_others_req(count_others_req_per_proc, others_req); + + NCI_Free(st_offsets); + NCI_Free(fd_start); + + /* restore flattend file view before leaving this sibroutine */ + fd->flat_file = saved_flat_file; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[0] += MPI_Wtime() - curT; +#endif + + /* w_len may not be the same as buf_view.size, because data sieving may + * write more than requested. + */ + return buf_view.size; +} + +/* If successful, it returns the amount written. Otherwise a NetCDF error code + * (negative value) is returned. + */ +static +MPI_Offset Exch_and_write(PNCIO_File *fd, void *buf, PNCIO_View buf_view, + int nprocs, + int myrank, + PNCIO_Access *others_req, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Aint * buf_idx) +{ +/* Send data to appropriate processes and write in sizes of no more + than coll_bufsize. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were written all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. For example, to write a distributed + array to a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. 
*/ + + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + MPI_Offset size=0, w_len, total_w_len=0; + int hole, i, m, ntimes, max_ntimes; + MPI_Offset st_loc = -1, end_loc = -1, off, done, req_off; + char *write_buf = NULL; + MPI_Count *curr_offlen_ptr, *send_size, *count, req_len, *recv_size; + MPI_Count *partial_recv, *sent_to_proc, *start_pos; + int flag; + MPI_Count *send_buf_idx, *curr_to_proc, *done_to_proc; + int info_flag; + MPI_Aint coll_bufsize; + char *value; + + /* only I/O errors are currently reported */ + +/* calculate the number of writes of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. */ + + value = (char *) NCI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + MPI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); + coll_bufsize = atoi(value); + NCI_Free(value); + + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + st_loc = others_req[i].offsets[0]; + end_loc = others_req[i].offsets[0]; + break; + } + } + + for (i = 0; i < nprocs; i++) + for (MPI_Count j = 0; j < others_req[i].count; j++) { + st_loc = MIN(st_loc, others_req[i].offsets[j]); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + + others_req[i].lens[j] - 1)); + } + +/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/ + + ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize); + + if ((st_loc == -1) && (end_loc == -1)) { + ntimes = 0; /* this process does no writing. */ + } + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_counter[0] = MAX(fd->write_counter[0], max_ntimes); +#endif + + write_buf = fd->io_buf; + + curr_offlen_ptr = NCI_Calloc(nprocs * 10, sizeof(*curr_offlen_ptr)); + /* its use is explained below. calloc initializes to 0. 
*/ + + count = curr_offlen_ptr + nprocs; + /* to store count of how many off-len pairs per proc are satisfied + * in an iteration. */ + + partial_recv = count + nprocs; + /* if only a portion of the last off-len pair is recd. from a process + * in a particular iteration, the length recd. is stored here. + * calloc initializes to 0. */ + + send_size = partial_recv + nprocs; + /* total size of data to be sent to each proc. in an iteration. + * Of size nprocs so that I can use MPI_Alltoall later. */ + + recv_size = send_size + nprocs; + /* total size of data to be recd. from each proc. in an iteration. */ + + sent_to_proc = recv_size + nprocs; + /* amount of data sent to each proc so far. Used in + * Fill_send_buffer. initialized to 0 here. */ + + send_buf_idx = sent_to_proc + nprocs; + curr_to_proc = send_buf_idx + nprocs; + done_to_proc = curr_to_proc + nprocs; + /* Above three are used in Fill_send_buffer */ + + start_pos = done_to_proc + nprocs; + /* used to store the starting value of curr_offlen_ptr[i] in + * this iteration */ + + done = 0; + off = st_loc; +// printf("%s at %d: off=%lld buf_view.size=%lld ntimes=%d\n",__func__,__LINE__, off,buf_view.size,ntimes); + + for (m = 0; m < ntimes; m++) { + /* go through all others_req and check which will be satisfied + * by the current write */ + + /* Note that MPI guarantees that displacements in filetypes are in + * monotonically nondecreasing order and that, for writes, the + * filetypes cannot specify overlapping regions in the file. This + * simplifies implementation a bit compared to reads. 
*/ + + /* off = start offset in the file for the data to be written in + * this iteration + * size = size of data written (bytes) corresponding to off + * req_off = off in file for a particular contiguous request + * minus what was satisfied in previous iteration + * req_size = size corresponding to req_off */ + + /* first calculate what should be communicated */ + + for (i = 0; i < nprocs; i++) + count[i] = recv_size[i] = 0; + + size = MIN(coll_bufsize, end_loc - st_loc + 1 - done); + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + MPI_Count j; + for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + if (partial_recv[i]) { + /* this request may have been partially + * satisfied in the previous iteration. */ + req_off = others_req[i].offsets[j] + partial_recv[i]; + req_len = others_req[i].lens[j] - partial_recv[i]; + partial_recv[i] = 0; + /* modify the off-len pair to reflect this change */ + others_req[i].offsets[j] = req_off; + others_req[i].lens[j] = req_len; + } else { + req_off = others_req[i].offsets[j]; + req_len = others_req[i].lens[j]; + } + if (req_off < off + size) { + count[i]++; + if (myrank != i) { + MPI_Aint addr; + MPI_Get_address(write_buf + req_off - off, &addr); + others_req[i].mem_ptrs[j] = addr; + } + else + others_req[i].mem_ptrs[j] = req_off - off; + recv_size[i] += MIN(off + size - req_off, req_len); + + if (off + size - req_off < req_len) { + partial_recv[i] = (off + size - req_off); + + /* --BEGIN ERROR HANDLING-- */ + if ((j + 1 < others_req[i].count) && + (others_req[i].offsets[j + 1] < off + size)) { + /* This error should not happen to PnetCDF, as + * fileview is checked before entering this + * subroutine. 
+ */ + fprintf(stderr, "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification\n"); + /* allow to continue since additional + * communication might have to occur + */ + return NC_EFILE; + } + /* --END ERROR HANDLING-- */ + break; + } + } else + break; + } + curr_offlen_ptr[i] = j; + } + } + + w_len = W_Exchange_data(fd, buf, write_buf, buf_view, send_size, + recv_size, off, size, count, start_pos, + partial_recv, sent_to_proc, nprocs, myrank, + min_st_offset, fd_size, fd_start, fd_end, + others_req, send_buf_idx, curr_to_proc, + done_to_proc, &hole, m, buf_idx); + + if (w_len < 0) + return w_len; + else + total_w_len += w_len; + + flag = 0; + for (i = 0; i < nprocs; i++) + if (count[i]) + flag = 1; + + if (flag) { + w_len = PNCIO_WriteContig(fd, write_buf, size, off); + if (w_len < 0) + return w_len; + else + total_w_len += w_len; + } + + off += size; + done += size; + } + + for (i = 0; i < nprocs; i++) + count[i] = recv_size[i] = 0; + for (m = ntimes; m < max_ntimes; m++) { + /* nothing to recv, but check for send. */ + w_len = W_Exchange_data(fd, buf, write_buf, buf_view, send_size, + recv_size, off, size, count, start_pos, + partial_recv, sent_to_proc, nprocs, myrank, + min_st_offset, fd_size, fd_start, fd_end, + others_req, send_buf_idx, curr_to_proc, + done_to_proc, &hole, m, buf_idx); + if (w_len < 0) + return w_len; + else + total_w_len += w_len; + } + + NCI_Free(curr_offlen_ptr); + + return total_w_len; +} + + +/* Sets error_code to MPI_SUCCESS if successful, or creates an error code + * in the case of error. 
+ */
+/* One round of the exchange phase of a collective write: every process
+ * learns how much it must send to each peer, receives the pieces that fall
+ * in its own window [off, off+size) into write_buf, and sends its own
+ * pieces out. *hole is set to 1 when the received pieces do not cover the
+ * window exactly; in that case the window is first read from the file.
+ * Returns err (NC_NOERR), or the negative value returned by
+ * PNCIO_ReadContig() when that read fails.
+ */
+static
+MPI_Offset W_Exchange_data(PNCIO_File *fd, void *buf, char *write_buf,
+                           PNCIO_View buf_view,
+                           MPI_Count *send_size, MPI_Count *recv_size,
+                           MPI_Offset off, MPI_Count size,
+                           MPI_Count *count, MPI_Count * start_pos,
+                           MPI_Count *partial_recv,
+                           MPI_Count *sent_to_proc, int nprocs,
+                           int myrank,
+                           MPI_Offset min_st_offset,
+                           MPI_Offset fd_size,
+                           MPI_Offset * fd_start, MPI_Offset * fd_end,
+                           PNCIO_Access * others_req,
+                           MPI_Count * send_buf_idx, MPI_Count * curr_to_proc,
+                           MPI_Count * done_to_proc, int *hole, int iter,
+                           MPI_Aint *buf_idx)
+{
+    int i, j, nprocs_recv, nprocs_send, err=NC_NOERR;
+    MPI_Count *tmp_len;
+    char **send_buf = NULL;
+    MPI_Request *requests, *send_req;
+    MPI_Datatype *recv_types, self_recv_type = MPI_DATATYPE_NULL;
+    MPI_Status *statuses, status;
+    MPI_Count sum, *srt_len = NULL;
+    int num_rtypes, nreqs;
+    MPI_Offset *srt_off = NULL;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+double curT = MPI_Wtime();
+#endif
+
+/* exchange recv_size info so that each process knows how much to
+   send to whom. */
+
+    MPI_Alltoall(recv_size, 1, MPI_COUNT, send_size, 1, MPI_COUNT, fd->comm);
+
+    /* create derived datatypes for recv */
+
+    /* sum = total number of off-len pairs received in this round */
+    nprocs_send = 0;
+    nprocs_recv = 0;
+    sum = 0;
+    for (i = 0; i < nprocs; i++) {
+        sum += count[i];
+        if (recv_size[i])
+            nprocs_recv++;
+        if (send_size[i])
+            nprocs_send++;
+    }
+
+    recv_types = (MPI_Datatype *) NCI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype));
+    /* +1 to avoid a 0-size malloc */
+
+    tmp_len = NCI_Malloc(nprocs * sizeof(*tmp_len));
+    j = 0;
+    for (i = 0; i < nprocs; i++) {
+        if (recv_size[i]) {
+            /* the datatype for data from self is kept apart from recv_types[] */
+            MPI_Datatype *dtype;
+            dtype = (i != myrank) ? (recv_types + j) : (&self_recv_type);
+
+            if (partial_recv[i]) {
+                /* take care if the last off-len pair is a partial recv */
+                MPI_Count k = start_pos[i] + count[i] - 1;
+                tmp_len[i] = others_req[i].lens[k];
+                others_req[i].lens[k] = partial_recv[i];
+            }
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Type_create_hindexed_c(count[i],
+                                     &(others_req[i].lens[start_pos[i]]),
+                                     &(others_req[i].mem_ptrs[start_pos[i]]),
+                                     MPI_BYTE, dtype);
+#else
+            MPI_Type_create_hindexed(count[i],
+                                     &(others_req[i].lens[start_pos[i]]),
+                                     &(others_req[i].mem_ptrs[start_pos[i]]),
+                                     MPI_BYTE, dtype);
+#endif
+            /* absolute displacements; use MPI_BOTTOM in recv */
+            MPI_Type_commit(dtype);
+            if (i != myrank)
+                j++;
+        }
+    }
+    num_rtypes = j; /* number of non-self receive datatypes created */
+
+    /* To avoid a read-modify-write, check if there are holes in the
+     * data to be written. For this, merge the (sorted) offset lists
+     * others_req using a heap-merge. */
+
+/* TODO: PNCIO_Heap_merge is expensive, borrow codes from ad_lustre_wrcoll.c to skip it when possible */
+
+    /* valgrind-detected optimization: if there is no work on this process we do
+     * not need to search for holes */
+    if (sum) {
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        double timing = MPI_Wtime();
+#endif
+        srt_off = (MPI_Offset *) NCI_Malloc(sum * sizeof(MPI_Offset));
+        srt_len = NCI_Malloc(sum * sizeof(*srt_len));
+
+        PNCIO_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
+                         nprocs, nprocs_recv, sum);
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        if (fd->is_agg) fd->write_timing[5] += MPI_Wtime() - timing;
+#endif
+    }
+
+    /* for partial recvs, restore original lengths */
+    for (i = 0; i < nprocs; i++)
+        if (partial_recv[i]) {
+            MPI_Count k = start_pos[i] + count[i] - 1;
+            others_req[i].lens[k] = tmp_len[i];
+        }
+    NCI_Free(tmp_len);
+
+    /* check if there are any holes. If yes, must do read-modify-write.
+     * holes can be in three places. 'middle' is what you'd expect: the
+     * processes are operating on noncontiguous data. But holes can also show
+     * up at the beginning or end of the file domain (see John Bent ROMIO REQ
+     * #835). Missing these holes would result in us writing more data than
+     * received by everyone else. */
+
+    *hole = 0;
+    if (sum) {
+        if (off != srt_off[0])  /* hole at the front */
+            *hole = 1;
+        else {  /* coalesce the sorted offset-length pairs */
+            /* srt_len[0] is grown in place to the merged extent */
+            for (i = 1; i < sum; i++) {
+                if (srt_off[i] <= srt_off[0] + srt_len[0]) {
+                    MPI_Count new_len = srt_off[i] + srt_len[i] - srt_off[0];
+                    if (new_len > srt_len[0])
+                        srt_len[0] = new_len;
+                } else
+                    break;
+            }
+            if (i < sum || size != srt_len[0])  /* hole in middle or end */
+                *hole = 1;
+        }
+
+        NCI_Free(srt_off);
+        NCI_Free(srt_len);
+    }
+
+    if (nprocs_recv) {
+        if (*hole) {
+            /* read-modify-write: fill write_buf with the current contents */
+            MPI_Offset r_len;
+            r_len = PNCIO_ReadContig(fd, write_buf, size, off);
+            /* NOTE(review): this early return happens before recv_types,
+             * the committed datatypes and the requests are released and
+             * before any send/recv is posted -- confirm this is intended.
+             */
+            if (r_len < 0) return r_len;
+        }
+    }
+
+    if (fd->atomicity) {
+        /* nreqs is the number of Isend and Irecv to be posted */
+        nreqs = (send_size[myrank]) ? (nprocs_send - 1) : nprocs_send;
+        requests = (MPI_Request *) NCI_Malloc((nreqs + 1) * sizeof(MPI_Request));
+        send_req = requests;
+    } else {
+        nreqs = nprocs_send + nprocs_recv;
+        if (send_size[myrank])  /* NO send to and recv from self */
+            nreqs -= 2;
+        requests = (MPI_Request *) NCI_Malloc((nreqs + 1) * sizeof(MPI_Request));
+        /* +1 to avoid a 0-size malloc */
+
+        /* post receives */
+        j = 0;
+        for (i = 0; i < nprocs; i++) {
+            if (recv_size[i] == 0)
+                continue;
+            if (i != myrank) {
+                MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, 0,
+                          fd->comm, requests + j);
+                j++;
+            } else if (buf_view.is_contig) {
+                /* send/recv to/from self uses MPI_Unpack() */
+assert(self_recv_type != MPI_DATATYPE_NULL);
+#ifdef HAVE_MPI_LARGE_COUNT
+                MPI_Count position=0;
+                MPI_Unpack_c((char *) buf + buf_idx[i], recv_size[i], &position,
+                           write_buf, 1, self_recv_type, MPI_COMM_SELF);
+#else
+                int position = 0;
+                assert(recv_size[i] < INT_MAX);
+                MPI_Unpack((char *) buf + buf_idx[i], (int)recv_size[i], &position,
+                           write_buf, 1, self_recv_type, MPI_COMM_SELF);
+#endif
+                buf_idx[i] += recv_size[i];
+            }
+        }
+        /* send requests are stored right after the receive requests */
+        send_req = requests + j;
+    }
+
+/* post sends. if buf_view.is_contig, data can be directly sent from
+   user buf at location given by buf_idx. else use send_buf. */
+
+    if (buf_view.is_contig) {
+        j = 0;
+        for (i = 0; i < nprocs; i++)
+            if (send_size[i] && i != myrank) {
+                assert(buf_idx[i] != -1);
+#if MPI_VERSION >= 4
+                MPI_Isend_c((char *) buf + buf_idx[i], send_size[i],
+                            MPI_BYTE, i, 0, fd->comm, send_req + j);
+#else
+                MPI_Isend((char *) buf + buf_idx[i], send_size[i],
+                          MPI_BYTE, i, 0, fd->comm, send_req + j);
+#endif
+                j++;
+                buf_idx[i] += send_size[i];
+            }
+    } else if (nprocs_send) {
+        /* buftype is not contig */
+        /* one allocation holds all outgoing data; send_buf[i] points at
+         * the slice destined for process i */
+        size_t msgLen = 0;
+        for (i = 0; i < nprocs; i++)
+            msgLen += send_size[i];
+        send_buf = (char **) NCI_Malloc(nprocs * sizeof(char *));
+        send_buf[0] = (char *) NCI_Malloc(msgLen * sizeof(char));
+        for (i = 1; i < nprocs; i++)
+            send_buf[i] = send_buf[i - 1] + send_size[i - 1];
+
+        Fill_send_buffer(fd, buf, buf_view, send_buf, send_size, send_req,
+                         sent_to_proc, nprocs, myrank, min_st_offset, fd_size,
+                         fd_start, fd_end, send_buf_idx, curr_to_proc,
+                         done_to_proc, iter);
+
+        /* the send is done in Fill_send_buffer */
+    }
+
+    if (fd->atomicity) {
+        /* In atomic mode, we must use blocking receives to receive data in the
+         * same increasing order of MPI process rank IDs,
+         */
+        j = 0;
+        for (i = 0; i < nprocs; i++) {
+            if (recv_size[i] == 0)
+                continue;
+            if (i != myrank) {
+                MPI_Recv(MPI_BOTTOM, 1, recv_types[j++], i, 0,
+                         fd->comm, &status);
+            } else {
+                /* send/recv to/from self uses MPI_Unpack() */
+                char *ptr = (buf_view.is_contig) ? (char *) buf + buf_idx[i] : send_buf[i];
+assert(self_recv_type != MPI_DATATYPE_NULL);
+#ifdef HAVE_MPI_LARGE_COUNT
+                MPI_Count position=0;
+                MPI_Unpack_c(ptr, recv_size[i], &position, write_buf, 1, self_recv_type,
+                             MPI_COMM_SELF);
+#else
+                int position = 0;
+                assert(recv_size[i] < INT_MAX);
+                MPI_Unpack(ptr, (int)recv_size[i], &position, write_buf, 1, self_recv_type,
+                           MPI_COMM_SELF);
+#endif
+                buf_idx[i] += recv_size[i];
+            }
+        }
+    } else if (!buf_view.is_contig && recv_size[myrank]) {
+        /* non-atomic, noncontiguous buffer: the data for self was packed
+         * into send_buf[myrank]; unpack it into write_buf */
+assert(self_recv_type != MPI_DATATYPE_NULL);
+#ifdef HAVE_MPI_LARGE_COUNT
+        MPI_Count position=0;
+        MPI_Unpack_c(send_buf[myrank], recv_size[myrank], &position, write_buf, 1, self_recv_type,
+                     MPI_COMM_SELF);
+#else
+        int position = 0;
+        assert(recv_size[myrank] < INT_MAX);
+        MPI_Unpack(send_buf[myrank], (int)recv_size[myrank], &position, write_buf, 1, self_recv_type,
+                   MPI_COMM_SELF);
+#endif
+    }
+
+    for (i = 0; i < num_rtypes; i++)
+        MPI_Type_free(recv_types + i);
+    NCI_Free(recv_types);
+
+    if (self_recv_type != MPI_DATATYPE_NULL)
+        MPI_Type_free(&self_recv_type);
+
+#ifdef HAVE_MPI_STATUSES_IGNORE
+    statuses = MPI_STATUSES_IGNORE;
+#else
+    statuses = (MPI_Status *) NCI_Malloc(nreqs * sizeof(MPI_Status));
+#endif
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    if (fd->is_agg) fd->write_timing[4] += MPI_Wtime() - curT;
+    curT = MPI_Wtime();
+#endif
+    /* wait for all posted nonblocking sends and receives to complete */
+    MPI_Waitall(nreqs, requests, statuses);
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    if (fd->is_agg) fd->write_timing[3] += MPI_Wtime() - curT;
+#endif
+
+#ifndef HAVE_MPI_STATUSES_IGNORE
+    NCI_Free(statuses);
+#endif
+    NCI_Free(requests);
+    if (!buf_view.is_contig && nprocs_send) {
+        NCI_Free(send_buf[0]);
+        NCI_Free(send_buf);
+    }
+
+    return err;
+}
+
+#define BUF_INCR \
+{ \
+    while (buf_incr) { \
+        size_in_buf = MIN(buf_incr, flat_buf_sz); \
+        user_buf_idx += size_in_buf; \
+        flat_buf_sz -= size_in_buf; \
+        buf_incr -= size_in_buf; \
+        if (buf_incr > 0 && flat_buf_sz == 0) { \
+            flat_buf_idx++; \
+            user_buf_idx = buf_view.off[flat_buf_idx]; \
+            flat_buf_sz = buf_view.len[flat_buf_idx]; \
+        } \
+    } \
+}
+/* BUF_COPY: copy `size` bytes from the user buffer, starting at
+ * user_buf_idx, into send_buf[p] at position send_buf_idx[p]. The copy is
+ * done piecewise, at most flat_buf_sz bytes at a time, stepping to the next
+ * offset-length pair of buf_view whenever the current one is used up.
+ * buf_incr is decremented by every byte copied; the trailing BUF_INCR then
+ * advances the user-buffer position over whatever remains in buf_incr
+ * without copying it.
+ * Operates directly on the caller's locals: size, size_in_buf, flat_buf_sz,
+ * flat_buf_idx, user_buf_idx, buf_incr, p, send_buf, send_buf_idx, buf and
+ * buf_view.
+ */
+#define BUF_COPY \
+{ \
+    while (size) { \
+        size_in_buf = MIN(size, flat_buf_sz); \
+        memcpy(&(send_buf[p][send_buf_idx[p]]), \
+               ((char *) buf) + user_buf_idx, size_in_buf); \
+        send_buf_idx[p] += size_in_buf; \
+        user_buf_idx += size_in_buf; \
+        flat_buf_sz -= size_in_buf; \
+        size -= size_in_buf; \
+        buf_incr -= size_in_buf; \
+        if (size > 0 && flat_buf_sz == 0) { \
+            flat_buf_idx++; \
+            user_buf_idx = buf_view.off[flat_buf_idx]; \
+            flat_buf_sz = buf_view.len[flat_buf_idx]; \
+        } \
+    } \
+    BUF_INCR \
+}
+/* Fill_send_buffer() - pack noncontiguous user data into per-destination
+ * send buffers and post the sends.
+ *
+ * Walks the offset-length pairs of fd->flat_file, splits each pair at the
+ * boundary returned by PNCIO_Calc_aggregator(), and copies the bytes that
+ * belong to process p from buf (laid out as described by buf_view) into
+ * send_buf[p]. Bytes already counted in sent_to_proc[p] on entry are
+ * skipped, and no more than send_size[p] bytes are copied for any p. As
+ * soon as send_buf[p] holds send_size[p] bytes and p != myrank, a
+ * nonblocking send of send_buf[p] is posted into the next free slot of
+ * requests[]. Before returning, sent_to_proc[p] is set to curr_to_proc[p]
+ * for every p whose send_size[p] is nonzero.
+ *
+ * send_buf_idx, curr_to_proc and done_to_proc are work arrays indexed by
+ * process rank (0 .. nprocs-1) that are initialized here.
+ * min_st_offset, fd_size and fd_end are only forwarded to
+ * PNCIO_Calc_aggregator(). fd_start and iter are not referenced in the body.
+ */
+static
+void Fill_send_buffer(PNCIO_File *fd, void *buf,
+                      PNCIO_View buf_view, char **send_buf,
+                      MPI_Count * send_size,
+                      MPI_Request * requests, MPI_Count * sent_to_proc,
+                      int nprocs, int myrank,
+                      MPI_Offset min_st_offset, MPI_Offset fd_size,
+                      MPI_Offset * fd_start, MPI_Offset * fd_end,
+                      MPI_Count * send_buf_idx, MPI_Count * curr_to_proc,
+                      MPI_Count * done_to_proc, int iter)
+{
+/* this function is only called if buftype is not contig */
+
+    int p, jj;
+    MPI_Offset flat_buf_idx, flat_buf_sz, size_in_buf, buf_incr, size;
+    MPI_Offset off, len, rem_len, user_buf_idx;
+
+/* curr_to_proc[p] = amount of data sent to proc. p that has already
+                     been accounted for so far
+   done_to_proc[p] = amount of data already sent to proc. p in
+                     previous iterations
+   user_buf_idx    = current location in user buffer
+   send_buf_idx[p] = current location in send_buf of proc. p
+   jj              = index of the next unused entry in requests[] */
+
+    for (MPI_Count i = 0; i < nprocs; i++) {
+        send_buf_idx[i] = curr_to_proc[i] = 0;
+        done_to_proc[i] = sent_to_proc[i];
+    }
+    jj = 0;
+
+    user_buf_idx = buf_view.off[0];
+    flat_buf_idx = 0;
+    flat_buf_sz = buf_view.len[0];
+
+    /* flat_buf_idx = current index into flattened buftype
+     * flat_buf_sz = size of current contiguous component in
+     * flattened buf */
+
+    for (MPI_Count i = 0; i < fd->flat_file.count; i++) {
+        off = fd->flat_file.off[i];
+        rem_len = fd->flat_file.len[i];
+
+        /* this request may span the file domains of more than one process */
+        while (rem_len != 0) {
+            len = rem_len;
+            /* NOTE: len value is modified by PNCIO_Calc_aggregator() to be no
+             * longer than the single region that processor "p" is responsible
+             * for.
+             */
+            p = PNCIO_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_end);
+
+            if (send_buf_idx[p] < send_size[p]) {
+                /* send_buf[p] still has room in this call */
+                if (curr_to_proc[p] + len > done_to_proc[p]) {
+                    if (done_to_proc[p] > curr_to_proc[p]) {
+                        /* this piece straddles the already-sent boundary:
+                         * skip the sent prefix, then copy the rest */
+                        size = MIN(curr_to_proc[p] + len -
+                                   done_to_proc[p], send_size[p] - send_buf_idx[p]);
+                        buf_incr = done_to_proc[p] - curr_to_proc[p];
+                        BUF_INCR
+                        buf_incr = curr_to_proc[p] + len - done_to_proc[p];
+                        /* size is capped by send_size[p] - send_buf_idx[p]
+                         * (see MIN above) */
+                        curr_to_proc[p] = done_to_proc[p] + size;
+                        BUF_COPY
+                    } else {
+                        /* nothing of this piece was sent before: copy as
+                         * much of it as still fits in send_buf[p] */
+                        size = MIN(len, send_size[p] - send_buf_idx[p]);
+                        buf_incr = len;
+                        curr_to_proc[p] += size;
+                        BUF_COPY
+                    }
+                    /* buffer for p is complete: send it, unless p is this
+                     * process itself */
+                    if (send_buf_idx[p] == send_size[p] && p != myrank) {
+#if MPI_VERSION >= 4
+                        MPI_Isend_c(send_buf[p], send_size[p], MPI_BYTE, p,
+                                  0, fd->comm, &requests[jj++]);
+#else
+                        MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
+                                  0, fd->comm, &requests[jj++]);
+#endif
+                    }
+                } else {
+                    /* whole piece lies within the already-sent amount:
+                     * account for it and step over it */
+                    curr_to_proc[p] += len;
+                    buf_incr = len;
+                    BUF_INCR
+                }
+            } else {
+                /* send_buf[p] is already full: only advance the user-buffer
+                 * position */
+                buf_incr = len;
+                BUF_INCR
+            }
+            off += len;
+            rem_len -= len;
+        }
+    }
+    for (int i = 0; i < nprocs; i++) {
+        if (send_size[i]) {
+            sent_to_proc[i] = curr_to_proc[i];
+        }
+    }
+}
+diff --git a/src/drivers/pncio/pncio_write_str.c
b/src/drivers/pncio/pncio_write_str.c new file mode 100644 index 0000000000..6cc1f555f0 --- /dev/null +++ b/src/drivers/pncio/pncio_write_str.c @@ -0,0 +1,328 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#define BUFFERED_WRITE { \ + if (req_off >= writebuf_off + writebuf_len) { \ + if (writebuf_len) { \ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, \ + writebuf_off); \ + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) \ + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + if (w_len < 0) goto fn_exit; \ + total_w_len += w_len; \ + } \ + writebuf_off = req_off; \ + writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1); \ + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) \ + PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off); \ + if (r_len < 0) goto fn_exit; \ + } \ + write_sz = (MPI_Aint)MIN(req_len, writebuf_off+writebuf_len-req_off); \ + memcpy(writebuf+req_off-writebuf_off, (char*)buf +userbuf_off, write_sz); \ + while (write_sz != req_len) { \ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \ + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) \ + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + if (w_len < 0) goto fn_exit; \ + total_w_len += w_len; \ + req_len -= write_sz; \ + userbuf_off += write_sz; \ + writebuf_off += writebuf_len; \ + writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1); \ + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) \ + PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off); \ + if (r_len < 0) goto fn_exit; \ + write_sz = MIN(req_len, writebuf_len); \ + memcpy(writebuf, (char *)buf 
+ userbuf_off, write_sz); \ + } \ +} + + +MPI_Offset PNCIO_GEN_WriteStrided(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + +/* offset is in units of etype relative to the filetype. */ + + char *writebuf = NULL; + int i, j, k, st_index = 0; + MPI_Aint writebuf_len, max_bufsize, write_sz, bufsize; + MPI_Offset i_offset, sum, num, size, abs_off_in_filetype=0; + MPI_Offset userbuf_off, off, req_off, disp, end_offset=0; + MPI_Offset writebuf_off, start_off, new_bwr_size, new_fwr_size; + MPI_Offset st_fwr_size, fwr_size = 0, bwr_size, req_len; + MPI_Offset r_len, w_len, total_w_len=0; + + /* Contiguous both in buftype and filetype should have been handled in a + * call to PNCIO_WriteContig() earlier. + */ + assert(!(buf_view.is_contig && fd->flat_file.is_contig)); + + if (fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) { + /* If user has disabled data sieving on reads, use naive approach + * instead. + */ + return PNCIO_GEN_WriteStrided_naive(fd, buf, buf_view, offset); + } + +// printf("%s at %d: offset=%lld\n",__func__,__LINE__, offset); + +/* PnetCDF always set these 3 conditions */ +assert(fd->filetype == MPI_BYTE); +assert(fd->flat_file.size == buf_view.size); +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + bufsize = buf_view.size; + + /* get max_bufsize from the info object. */ + max_bufsize = fd->hints->ind_wr_buffer_size; + + if (!buf_view.is_contig && fd->flat_file.is_contig) { + /* noncontiguous in memory, contiguous in file. 
*/ + + off = fd->disp + offset; +assert(fd->disp == 0); + if (fd->flat_file.count > 0) off += fd->flat_file.off[0]; + + start_off = off; + end_offset = off + bufsize - 1; + writebuf_off = off; + writebuf = (char *) NCI_Malloc(max_bufsize); + writebuf_len = MIN(max_bufsize, end_offset - writebuf_off + 1); + + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + for (i = 0; i < buf_view.count; i++) { + userbuf_off = buf_view.off[i]; + req_off = off; + req_len = buf_view.len[i]; + + /* BUFFERED_WRITE_WITHOUT_READ does neither read-modify-write nor + * file lock + */ + if (req_off >= writebuf_off + writebuf_len) { + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, + writebuf_off); + if (w_len < 0) goto fn_exit; + total_w_len += w_len; + writebuf_off = req_off; + writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1); + } + write_sz = MIN(req_len, writebuf_off + writebuf_len - req_off); + memcpy(writebuf+req_off-writebuf_off, (char*)buf +userbuf_off, + write_sz); + while (write_sz != req_len) { + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, + writebuf_off); + if (w_len < 0) goto fn_exit; + total_w_len += w_len; + req_len -= write_sz; + userbuf_off += write_sz; + writebuf_off += writebuf_len; + writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1); + write_sz = MIN(req_len, writebuf_len); + memcpy(writebuf, (char *)buf + userbuf_off, write_sz); + } + + off += buf_view.len[i]; + } + + /* write the buffer out finally */ + if (writebuf_len) { + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + if (w_len >= 0) total_w_len += w_len; + } + else + w_len = 0; + + if (fd->atomicity || fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + if (w_len < 0) + goto fn_exit; + } + else { /* 
noncontiguous in file */ + MPI_Offset size_in_filetype = offset; + + disp = fd->disp; +assert(fd->disp == 0); + + sum = 0; + for (i = 0; i < fd->flat_file.count; i++) { + sum += fd->flat_file.len[i]; + if (sum > size_in_filetype) { + st_index = i; + fwr_size = sum - size_in_filetype; + abs_off_in_filetype = fd->flat_file.off[i] + + size_in_filetype - (sum - fd->flat_file.len[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + abs_off_in_filetype; + + start_off = offset; +assert(offset == abs_off_in_filetype); + +// printf("%s at %d: start_off=%lld abs_off_in_filetype=%lld\n",__func__,__LINE__,start_off,abs_off_in_filetype); + + /* Write request is within single flat_file contig block. This could + * happen, for example, with subarray types that are actually fairly + * contiguous. + */ + if (buf_view.is_contig && bufsize <= fwr_size) { + /* though MPI api has an integer 'count' parameter, derived + * datatypes might describe more bytes than can fit into an integer. + * if we've made it this far, we can pass a count of original + * datatypes, instead of a count of bytes (which might overflow) + * Other WriteContig calls in this path are operating on data + * sieving buffer */ + PNCIO_WRITE_LOCK(fd, offset, SEEK_SET, bufsize); + w_len = PNCIO_WriteContig(fd, buf, buf_view.size, offset); + if (w_len > 0) total_w_len += w_len; + PNCIO_UNLOCK(fd, offset, SEEK_SET, bufsize); + + goto fn_exit; + } + + /* Calculate end_offset, the last byte-offset that will be accessed. 
+ * e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */ + + st_fwr_size = fwr_size; + j = st_index; + fwr_size = MIN(fwr_size, bufsize); + i_offset = fwr_size; + end_offset = offset + fwr_size - 1; + while (i_offset < bufsize) { + j++; + fwr_size = MIN(fd->flat_file.len[j], bufsize - i_offset); + i_offset += fwr_size; + end_offset = disp + fd->flat_file.off[j] + fwr_size - 1; + } + + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + writebuf_off = 0; + writebuf_len = 0; + writebuf = (char *) NCI_Malloc(max_bufsize); + memset(writebuf, -1, max_bufsize); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file should be the most + * common case. + */ + i_offset = 0; + j = st_index; + off = offset; + fwr_size = MIN(st_fwr_size, bufsize); + while (i_offset < bufsize) { + if (fwr_size) { + req_off = off; + req_len = fwr_size; + userbuf_off = i_offset; + BUFFERED_WRITE; + } + + i_offset += fwr_size; + if (i_offset >= bufsize) break; + + if (off + fwr_size < disp + fd->flat_file.off[j] + + fd->flat_file.len[j]) + off += fwr_size; /* off is incremented by fwr_size. 
*/ + else { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + fwr_size = MIN(fd->flat_file.len[j], + bufsize - i_offset); + } + } + } else { + /* noncontiguous in memory as well as in file */ + k = num = 0; + i_offset = buf_view.off[0]; + j = st_index; + off = offset; + fwr_size = st_fwr_size; + bwr_size = buf_view.len[0]; + + while (num < bufsize) { + size = MIN(fwr_size, bwr_size); + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + BUFFERED_WRITE; + } + + num += size; + if (num >= bufsize) break; + + new_fwr_size = fwr_size; + new_bwr_size = bwr_size; + + if (size == fwr_size) { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + new_fwr_size = fd->flat_file.len[j]; + if (size != bwr_size) { + i_offset += size; + new_bwr_size -= size; + } + } + + if (size == bwr_size) { + /* reached end of contiguous block in memory */ + + k++; +assert(k < buf_view.count); + i_offset = buf_view.off[k]; + new_bwr_size = buf_view.len[k]; + if (size != fwr_size) { + off += size; + new_fwr_size -= size; + } + } + fwr_size = new_fwr_size; + bwr_size = new_bwr_size; + } + } + + /* write the buffer out finally */ + if (writebuf_len) { + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + if (!fd->atomicity && fd->hints->romio_ds_write == PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + if (w_len < 0) goto fn_exit; + total_w_len += w_len; + } + if (fd->atomicity || fd->hints->romio_ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + } + +fn_exit: + if (writebuf != NULL) + NCI_Free(writebuf); + + return total_w_len; +} diff --git a/src/drivers/pncio/pncio_write_str_naive.c b/src/drivers/pncio/pncio_write_str_naive.c new file mode 100644 index 0000000000..9ac11de2bc --- /dev/null +++ b/src/drivers/pncio/pncio_write_str_naive.c @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne 
National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +MPI_Offset PNCIO_GEN_WriteStrided_naive(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + int b_index; + MPI_Count bufsize; + + /* bwr == buffer write; fwr == file write */ + MPI_Offset bwr_size, fwr_size = 0, sum, size_in_filetype, size; + MPI_Offset abs_off_in_filetype = 0, req_len, userbuf_off; + MPI_Offset off, req_off, disp, end_offset = 0, start_off; + MPI_Offset w_len, total_w_len=0; + +/* PnetCDF always sets fd->filetype == MPI_BYTE */ +assert(fd->filetype == MPI_BYTE); + + /* Contiguous both in buftype and filetype should have been handled in a + * call to PNCIO_WriteContig() earlier. + */ + assert(!(buf_view.is_contig && fd->flat_file.is_contig)); + + bufsize = buf_view.size; + + if (!buf_view.is_contig && fd->flat_file.is_contig) { + /* noncontiguous in memory, contiguous in file. */ + + off = fd->disp + offset; + + start_off = off; + end_offset = off + bufsize - 1; + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + /* for each region in the buffer, grab the data and put it in place */ + for (b_index = 0; b_index < buf_view.count; b_index++) { + userbuf_off = buf_view.off[b_index]; + req_off = off; + req_len = buf_view.len[b_index]; + + w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (w_len < 0) return w_len; + total_w_len += w_len; + + /* off is (potentially) used to save the final offset later */ + off += buf_view.len[b_index]; + } + + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + } + else { /* noncontiguous in file */ + int f_index, st_index = 0; + MPI_Offset st_fwr_size; + + /* First we're going to calculate a set of values 
for use in all + * the noncontiguous in file cases: + * start_off - starting byte position of data in file + * end_offset - last byte offset to be accessed in the file + * st_index - index of block in first filetype that we will be + * starting in (?) + * st_fwr_size - size of the data in the first filetype block + * that we will write (accounts for being part-way + * into writing this block of the filetype + */ + + disp = fd->disp; + +/* noncontiguous in fileview, disp and offset should be 0 for PnetCDF */ +assert(fd->disp == 0); +assert(offset == 0); + + size_in_filetype = offset; + + sum = 0; + for (f_index = 0; f_index < fd->flat_file.count; f_index++) { + sum += fd->flat_file.len[f_index]; + if (sum > size_in_filetype) { + st_index = f_index; + fwr_size = sum - size_in_filetype; + abs_off_in_filetype = fd->flat_file.off[f_index] + + size_in_filetype - (sum - fd->flat_file.len[f_index]); + break; + } + } + + /* abs. offset in bytes in the file */ + start_off = disp + abs_off_in_filetype; + + st_fwr_size = fwr_size; + + /* start_off, st_index, and st_fwr_size are + * all calculated at this point + */ + + /* Calculate end_offset, the last byte-offset that will be accessed. + * e.g., if start_off=0 and 100 bytes to be written, end_offset=99 + */ + f_index = st_index; + fwr_size = MIN(st_fwr_size, bufsize); + userbuf_off = fwr_size; + end_offset = start_off + fwr_size - 1; + while (userbuf_off < bufsize) { + f_index++; + fwr_size = MIN(fd->flat_file.len[f_index], + bufsize - userbuf_off); + userbuf_off += fwr_size; + end_offset = disp + fd->flat_file.off[f_index] + fwr_size - 1; + } + + /* End of calculations. 
At this point the following values have + * been calculated and are ready for use: + * - start_off + * - end_offset + * - st_index + * - st_fwr_size + */ + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file. should be the + * most common case. + */ + + userbuf_off = 0; + f_index = st_index; + off = start_off; + fwr_size = MIN(st_fwr_size, bufsize); + + /* while there is still space in the buffer, write more data */ + while (userbuf_off < bufsize) { + if (fwr_size) { + /* TYPE_UB and TYPE_LB can result in + * fwr_size = 0. save system call in such cases */ + req_off = off; + req_len = fwr_size; + + w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (w_len < 0) return w_len; + total_w_len += w_len; + } + userbuf_off += fwr_size; + if (userbuf_off >= bufsize) break; + + if (off + fwr_size < disp + fd->flat_file.off[f_index] + + fd->flat_file.len[f_index]) { + /* important that this value be correct, as it is + * used to set the offset in the fd near the end of + * this function. + */ + off += fwr_size; + } + /* did not reach end of contiguous block in filetype. + * no more I/O needed. off is incremented by fwr_size. 
+ */ + else { + f_index++; +assert(f_index < fd->flat_file.count); + off = disp + fd->flat_file.off[f_index]; + fwr_size = MIN(fd->flat_file.len[f_index], + bufsize - userbuf_off); + } + } + } else { + MPI_Offset i_offset, tmp_bufsize = 0; + /* noncontiguous in memory as well as in file */ + + b_index = 0; + i_offset = buf_view.off[0]; + f_index = st_index; + off = start_off; + fwr_size = st_fwr_size; + bwr_size = buf_view.len[0]; + + /* while we haven't read size * count bytes, keep going */ + while (tmp_bufsize < bufsize) { + MPI_Offset new_bwr_size = bwr_size, new_fwr_size = fwr_size; + + size = MIN(fwr_size, bwr_size); + /* keep max of a single read amount <= INT_MAX */ + size = MIN(size, INT_MAX); + + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + + w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (w_len < 0) return w_len; + total_w_len += w_len; + } + + tmp_bufsize += size; + if (tmp_bufsize >= bufsize) break; + + if (size == fwr_size) { + f_index++; +assert(f_index < fd->flat_file.count); + off = disp + fd->flat_file.off[f_index]; + new_fwr_size = fd->flat_file.len[f_index]; + if (size != bwr_size) { + i_offset += size; + new_bwr_size -= size; + } + } + + if (size == bwr_size) { + /* reached end of contiguous block in memory */ + b_index++; +assert(b_index < buf_view.count); + i_offset = buf_view.off[b_index]; + new_bwr_size = buf_view.len[b_index]; + if (size != fwr_size) { + off += size; + new_fwr_size -= size; + } + } + fwr_size = new_fwr_size; + bwr_size = new_bwr_size; + } + } + + /* unlock the file region if we locked it */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + } /* end of (else noncontiguous in file) */ + + return total_w_len; +} diff --git a/src/include/dispatch.h b/src/include/dispatch.h index 8276a4245a..577aa8b998 100644 --- a/src/include/dispatch.h +++ b/src/include/dispatch.h @@ -44,10 +44,18 
@@ typedef enum { API_VARM } NC_api; +typedef struct { + int ref_count ; /* reference count */ + int num_nodes; /* number of unique compute nodes */ + int *ids; /* [nprocs] node ID of each MPI process */ +} PNCIO_node_ids; + struct PNC_driver { /* APIs manipulate files */ - int (*create)(MPI_Comm, const char*, int, int, MPI_Info, void**); - int (*open)(MPI_Comm, const char*, int, int, MPI_Info, void**); + int (*create)(MPI_Comm, const char*, int, int, int, MPI_Info, + PNCIO_node_ids, void**); + int (*open)(MPI_Comm, const char*, int, int, int, MPI_Info, + PNCIO_node_ids, void**); int (*close)(void*); int (*enddef)(void*); int (*_enddef)(void*,MPI_Offset,MPI_Offset,MPI_Offset,MPI_Offset); @@ -153,6 +161,8 @@ extern PNC_driver* ncfoo_inq_driver(void); extern PNC_driver* ncbbio_inq_driver(void); +extern PNC_driver* ncchkio_inq_driver(void); + extern int PNC_check_id(int ncid, PNC **pncp); #endif /* H_PNC_DISPATCH */ diff --git a/src/include/pnc_debug.h b/src/include/pnc_debug.h index 9762448964..cda7bb4f4c 100644 --- a/src/include/pnc_debug.h +++ b/src/include/pnc_debug.h @@ -45,6 +45,16 @@ } #ifdef PNETCDF_DEBUG + +/* PNETCDF_VERBOSE_DEBUG_MODE environment variable can be used to print the + * location in the source code where the error code originated, no matter + * whether the error is intended or not. This run-time environment variable takes + * effect only when PnetCDF is configured in debug mode, i.e. --enable-debug + * is set at the configure command line. This feature is mainly for PnetCDF + * developers, who are warned that enabling this mode may result in a lot of + * debugging messages printed to stderr. 
+ */ + #define DEBUG_RETURN_ERROR(err) { \ char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ if (_env_str != NULL && *_env_str != '0') { \ @@ -55,6 +65,27 @@ } \ return err; \ } +#define DEBUG_FOPEN_ERROR(err) { \ + if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm); \ + char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ + if (_env_str != NULL && *_env_str != '0') { \ + int _rank; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_rank); \ + fprintf(stderr, "Rank %d: %s error at line %d of %s in %s\n", \ + _rank,ncmpi_strerrno(err),__LINE__,__func__,__FILE__); \ + } \ + return err; \ +} +#define DEBUG_RETURN_ERROR_MSG(err, msg) { \ + char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ + if (_env_str != NULL && *_env_str != '0') { \ + int _rank; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_rank); \ + fprintf(stderr, "Rank %d: %s error at line %d of %s in %s (%s)\n", \ + _rank,ncmpi_strerrno(err),__LINE__,__func__,__FILE__, msg); \ + } \ + return err; \ +} #define DEBUG_ASSIGN_ERROR(status, err) { \ char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ if (_env_str != NULL && *_env_str != '0') { \ @@ -76,6 +107,11 @@ } #else #define DEBUG_RETURN_ERROR(err) return err; +#define DEBUG_RETURN_ERROR_MSG(err, msg) return err; +#define DEBUG_FOPEN_ERROR(err) { \ + if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm); \ + return err; \ +} #define DEBUG_ASSIGN_ERROR(status, err) status = err; #define DEBUG_TRACE_ERROR(err) #endif diff --git a/src/include/pnetcdf.h.in b/src/include/pnetcdf.h.in index b363276b6c..198fabbe11 100644 --- a/src/include/pnetcdf.h.in +++ b/src/include/pnetcdf.h.in @@ -16,6 +16,7 @@ #define PNETCDF_VERSION_MAJOR @PNETCDF_VERSION_MAJOR@ #define PNETCDF_VERSION_MINOR @PNETCDF_VERSION_MINOR@ #define PNETCDF_VERSION_SUB @PNETCDF_VERSION_SUB@ +#define PNETCDF_VERSION_PRE "@PNETCDF_VERSION_PRE@" #define PNETCDF_RELEASE_DATE "DIST_DATE" /* List of PnetCDF features enabled/disabled at configure time. 
@@ -657,6 +658,7 @@ by the desired type. */ #define NC_EBADLOG (-238) /**< Unrecognized log file format */ #define NC_EFLUSHED (-239) /**< Nonblocking request has already been flushed. It is too late to cancel */ #define NC_EADIOS (-240) /**< unknown ADIOS error */ +#define NC_EFSTYPE (-241) /**< Invalid file system type */ /* add new error here */ /* header inconsistency errors start from -250 */ @@ -684,9 +686,10 @@ by the desired type. */ #define NC_EMULTIDEFINE_VAR_FILL_MODE (-271) /**< inconsistent variable fill mode */ #define NC_EMULTIDEFINE_VAR_FILL_VALUE (-272) /**< inconsistent variable fill value */ #define NC_EMULTIDEFINE_CMODE (-273) /**< inconsistent file create modes among processes */ +#define NC_EMULTIDEFINE_HINTS (-274) /**< inconsistent I/O hints among processes */ #define NC_EMULTIDEFINE_FIRST NC_EMULTIDEFINE -#define NC_EMULTIDEFINE_LAST NC_EMULTIDEFINE_CMODE +#define NC_EMULTIDEFINE_LAST NC_EMULTIDEFINE_HINTS /* backward compatible with PnetCDF 1.3.1 and earlier */ #define NC_ECMODE NC_EMULTIDEFINE_OMODE @@ -791,6 +794,18 @@ extern int ncmpi_def_var(int ncid, const char *name, nc_type xtype, int ndims, const int *dimidsp, int *varidp); +#define NC_FILTER_NONE 0 +#define NC_FILTER_DEFLATE 2 +#define NC_FILTER_SZ 3 +extern int +ncmpi_var_set_chunk (int ncid, int varid, int *chunk_dim); +extern int +ncmpi_var_get_chunk (int ncid, int varid, int *chunk_dim); +extern int +ncmpi_var_set_filter (int ncid, int varid, int filter); +extern int +ncmpi_var_get_filter (int ncid, int varid, int *filter); + extern int ncmpi_rename_dim(int ncid, int dimid, const char *name); diff --git a/src/libs/Makefile.am b/src/libs/Makefile.am index a932f20f56..e93cf60862 100644 --- a/src/libs/Makefile.am +++ b/src/libs/Makefile.am @@ -23,6 +23,7 @@ libpnetcdf_la_SOURCES = libpnetcdf_la_LIBADD += ../dispatchers/libdispatchers.la libpnetcdf_la_LIBADD += ../drivers/common/libcommon.la libpnetcdf_la_LIBADD += ../drivers/ncmpio/libncmpio.la +libpnetcdf_la_LIBADD += 
../drivers/pncio/libpncio.la if BUILD_DRIVER_FOO libpnetcdf_la_LIBADD += ../drivers/ncfoo/libncfoo.la endif @@ -32,6 +33,9 @@ endif if ENABLE_BURST_BUFFER libpnetcdf_la_LIBADD += ../drivers/ncbbio/libncbbio.la endif +if ENABLE_CHUNKING + libpnetcdf_la_LIBADD += ../drivers/ncchunkio/libncchkio.la +endif if ENABLE_ADIOS libpnetcdf_la_LIBADD += ../drivers/ncadios/libncadios.la endif @@ -71,6 +75,9 @@ endif ../drivers/ncmpio/libncmpio.la: set -e; cd ../drivers/ncmpio && $(MAKE) $(MFLAGS) +../drivers/pncio/libpncio.la: + set -e; cd ../drivers/pncio && $(MAKE) $(MFLAGS) + ../drivers/ncncio/libncncio.la: set -e; cd ../drivers/ncncio && $(MAKE) $(MFLAGS) diff --git a/src/utils/ncmpidiff/Makefile.am b/src/utils/ncmpidiff/Makefile.am index 0cbc72adb5..0abd162c27 100644 --- a/src/utils/ncmpidiff/Makefile.am +++ b/src/utils/ncmpidiff/Makefile.am @@ -11,8 +11,11 @@ AM_CPPFLAGS += -I$(top_builddir)/src/include bin_PROGRAMS = ncmpidiff cdfdiff +noinst_LTLIBRARIES = libncmpidiff_core.la +libncmpidiff_core_la_SOURCES = ncmpidiff_core.c + ncmpidiff_SOURCES = ncmpidiff.c -ncmpidiff_LDADD = $(top_builddir)/src/libs/libpnetcdf.la +ncmpidiff_LDADD = $(top_builddir)/src/libs/libpnetcdf.la $(noinst_LTLIBRARIES) ncmpidiff_LDADD += @NETCDF4_LDFLAGS@ @ADIOS_LDFLAGS@ @NETCDF4_LIBS@ @ADIOS_LIBS@ cdfdiff_SOURCES = cdfdiff.c @@ -25,6 +28,8 @@ $(top_builddir)/src/libs/libpnetcdf.la: dist_man_MANS = ncmpidiff.1 cdfdiff.1 +EXTRA_DIST = ncmpidiff_core.h + CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out dist-hook: diff --git a/src/utils/ncmpidiff/cdfdiff.c b/src/utils/ncmpidiff/cdfdiff.c index be19bc0c7e..2045efe65d 100644 --- a/src/utils/ncmpidiff/cdfdiff.c +++ b/src/utils/ncmpidiff/cdfdiff.c @@ -187,9 +187,9 @@ struct vspec { /*----< get_var_names() >-----------------------------------------------------*/ static void -get_var_names(char *optarg, struct vspec* vspecp) +get_var_names(char *opt_arg, struct vspec* vspecp) { - char *cp=optarg, **cpp; + char *cp=opt_arg, **cpp; int nvars = 1; /* 
compute number of variable names in comma-delimited list */ @@ -203,7 +203,7 @@ get_var_names(char *optarg, struct vspec* vspecp) cpp = vspecp->names; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -237,11 +237,11 @@ get_type(int type) /*----< main() >--------------------------------------------------------------*/ int main(int argc, char **argv) { + /* int verbose; is defined as a locally global variable in ncvalidator.c */ extern char *optarg; extern int optind; - char *str, *ptr; size_t nbytes; - int i, j, k, m, n, c, err, verbose, quiet, isDiff; + int i, j, k, m, n, c, err, quiet, isDiff; int fd[2], nvars[2], ndims[2], nattrs[2], check_tolerance; int cmp_nvars, check_header, check_variable_list, check_entire_file; long long numVarDIFF=0, numHeadDIFF=0, numDIFF; @@ -264,7 +264,8 @@ int main(int argc, char **argv) var_list.nvars = 0; check_tolerance = 0; - while ((c = getopt(argc, argv, "bhqv:t:")) != -1) + while ((c = getopt(argc, argv, "bhqv:t:")) != -1) { + char *str, *ptr; switch(c) { case 'h': /* compare header only */ check_header = 1; @@ -301,6 +302,7 @@ int main(int argc, char **argv) usage(argv[0]); break; } + } /* quiet mode overwrites verbose */ if (quiet) verbose = 0; @@ -394,9 +396,9 @@ int main(int argc, char **argv) /* compare file header */ if (check_header) { - NC_attr *attr[2]; - NC_dim *dim[2]; - NC_var *var[2]; + NC_attr *attr[2]={NULL, NULL}; + NC_dim *dim[2]={NULL, NULL}; + NC_var *var[2]={NULL, NULL}; /* compare number of dimensions defined */ if (ndims[0] != ndims[1]) { diff --git a/src/utils/ncmpidiff/ncmpidiff.c b/src/utils/ncmpidiff/ncmpidiff.c index 8f1fe4f203..c61d6c4acf 100644 --- a/src/utils/ncmpidiff/ncmpidiff.c +++ b/src/utils/ncmpidiff/ncmpidiff.c @@ -23,27 +23,15 @@ #include #include #include -#include -#include /* INFINITY */ +#include /* stat() */ +#include /* stat() */ +#include /* stat() */ +#include /* errno */ 
+#include /* INFINITY */ #include #include - -#ifndef ubyte -#define ubyte unsigned char -#endif -#ifndef ushort -#define ushort unsigned short -#endif -#ifndef uint -#define uint unsigned int -#endif -#ifndef int64 -#define int64 long long -#endif -#ifndef uint64 -#define uint64 unsigned long long -#endif +#include #define OOM_ERROR { \ @@ -51,213 +39,6 @@ exit(1); \ } -#define HANDLE_ERROR { \ - if (err != NC_NOERR) { \ - fprintf(stderr, "Error at line %d of file %s (%s)\n", __LINE__, \ - __FILE__, ncmpi_strerror(err)); \ - MPI_Abort(MPI_COMM_WORLD, -1); \ - exit(-1); \ - } \ -} - -#define CHECK_GLOBAL_ATT_DIFF_CHAR { \ - int pos; \ - char *b1 = (char *)calloc((attlen[0] + 1) * 2, sizeof(char)); \ - char *b2 = b1 + attlen[0] + 1; \ - if (!b1) OOM_ERROR \ - err = ncmpi_get_att_text(ncid[0], NC_GLOBAL, name[0], b1); \ - HANDLE_ERROR \ - err = ncmpi_get_att_text(ncid[1], NC_GLOBAL, name[0], b2); \ - HANDLE_ERROR \ - for (pos=0; pos= 0) ? (x) : (-x) -#define UABS(x) (x) - -#define CHECK_VAR_DIFF(type, func, xabs) { \ - int pos, isDiff, worst = -1; \ - type *b1, *b2; \ - b1 = (type *)calloc(varsize * 2, sizeof(type)); \ - if (!b1) OOM_ERROR \ - b2 = b1 + varsize; \ - err = ncmpi_get_vara_##func(ncid[0], varid1, start, shape, b1); \ - HANDLE_ERROR \ - err = ncmpi_get_vara_##func(ncid[1], varid2, start, shape, b2); \ - HANDLE_ERROR \ - if (!check_tolerance) { \ - for (pos=0; pos abs_b2) ? abs_b1 : abs_b2; \ - diff = b1[pos] - b2[pos]; \ - diff = (diff >= 0) ? 
diff : -diff; \ - ratio = diff / abs_max; \ - if (diff <= tolerance_difference || ratio <= tolerance_ratio) \ - continue; \ - /* fail to meet both tolerance errors */ \ - worst = pos; \ - break; \ - } \ - } \ - if (pos != varsize || worst != -1) { /* diff is found */ \ - double v1, v2; \ - if (ndims[0] == 0) { /* scalar variable */ \ - if (worst == -1) \ - printf("DIFF: scalar variable \"%s\" of type \"%s\"\n", \ - name[0], get_type(xtype[0])); \ - else { \ - v1 = b1[worst]; \ - v2 = b2[worst]; \ - printf("DIFF (tolerance): scalar variable \"%s\" of type \"%s\" of value %g vs %g (difference = %e)\n", \ - name[0], get_type(xtype[0]), v1, v2, v1-v2); \ - } \ - } else { \ - int _i; \ - MPI_Offset *diffStart; \ - diffStart = (MPI_Offset*) malloc(sizeof(MPI_Offset) * ndims[0]); \ - if (worst != -1) pos = worst; \ - v1 = b1[pos]; \ - v2 = b2[pos]; \ - for (_i=ndims[0]-1; _i>=0; _i--) { \ - diffStart[_i] = pos % shape[_i] + start[_i]; \ - pos /= shape[_i]; \ - } \ - if (worst == -1) \ - printf("DIFF: variable \"%s\" of type \"%s\" at element ["OFFFMT, \ - name[0], get_type(xtype[0]), diffStart[0]); \ - else \ - printf("DIFF (tolerance): variable \"%s\" of type \"%s\" at element ["OFFFMT, \ - name[0], get_type(xtype[0]), diffStart[0]); \ - for (_i=1; _i-----------------------------------------------------*/ static void -get_var_names(char *optarg, struct vspec* vspecp) +get_var_names(char *opt_arg, + int *nvars, + char ***names) { - char *cp=optarg, **cpp; - int nvars = 1; + char *cp=opt_arg, **cpp; + *nvars = 1; /* compute number of variable names in comma-delimited list */ - vspecp->nvars = 1; while (*cp++) if (*cp == ',') - nvars++; + (*nvars)++; - vspecp->names = (char **) calloc((size_t)nvars, sizeof(char*)); - if (!vspecp->names) OOM_ERROR + *names = (char **) calloc((size_t)*nvars, sizeof(char*)); + if (!*names) OOM_ERROR - cpp = vspecp->names; + cpp = *names; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp 
!= NULL; cp = strtok((char *) NULL, ",")) { @@ -324,27 +106,6 @@ get_var_names(char *optarg, struct vspec* vspecp) if (!*cpp) OOM_ERROR cpp++; } - vspecp->nvars = nvars; -} - -/*----< get_type() >----------------------------------------------------------*/ -static char* -get_type(int type) -{ - switch (type) { - case NC_BYTE: return "NC_BYTE"; - case NC_CHAR: return "NC_CHAR"; - case NC_SHORT: return "NC_SHORT"; - case NC_INT: return "NC_INT"; - case NC_FLOAT: return "NC_FLOAT"; - case NC_DOUBLE: return "NC_DOUBLE"; - case NC_UBYTE: return "NC_UBYTE"; - case NC_USHORT: return "NC_USHORT"; - case NC_UINT: return "NC_UINT"; - case NC_INT64: return "NC_INT64"; - case NC_UINT64: return "NC_UINT64"; - } - return "NC_NAT"; } /*----< main() >--------------------------------------------------------------*/ @@ -352,31 +113,38 @@ int main(int argc, char **argv) { extern char *optarg; extern int optind; - char *name[2]; - int i, j, c, err, rank, nprocs, verbose, quiet, check_tolerance; - int ncid[2], ndims[2], nvars[2], natts[2], recdim[2], *dimids[2], fmt[2]; - int cmp_nvars, check_header, check_variable_list, check_entire_file; - long long numVarDIFF=0, numHeadDIFF=0, varDIFF, numDIFF; + char cmd_opts[1024], **var_names; + int i, c, rank, nprocs, verbose, quiet, check_tolerance; + int first_diff, ncid[2], num_vars; + int check_header, check_variable_list, check_entire_file; + long long numDIFF; double tolerance_ratio, tolerance_difference; - MPI_Offset *shape=NULL, varsize, *start=NULL; - MPI_Offset attlen[2], dimlen[2]; MPI_Comm comm=MPI_COMM_WORLD; MPI_Info info = MPI_INFO_NULL; - nc_type xtype[2]; - struct vspec var_list; MPI_Init(&argc, &argv); MPI_Comm_size(comm, &nprocs); MPI_Comm_rank(comm, &rank); + if (nprocs == 1) + strcpy(cmd_opts, "ncmpidiff"); + else + sprintf(cmd_opts, "Rank %d: ncmpidiff", rank); + + for (i=1; i 0 && ndims[1] > 0) { - if (verbose) - printf("Dimension:\n"); - } else - goto cmp_vars; - - /* check dimensions in 1st file also appear in 2nd file 
*/ - for (i=0; i 0 && nvars[1] > 0) { - if (verbose) - printf("Variables:\n"); - } else - goto cmp_exit; - - /* check variables defined in 1st file and also in 2nd file */ - for (i=0; i 0 && dimids[0][0] == recdim[0]) { /* record variable */ - err = ncmpi_inq_dimlen(ncid[0], recdim[0], &shape[0]); - HANDLE_ERROR - if (shape[0] == 0) { - /* No record has been written to the file, skip comparison */ - free(shape); - free(dimids[0]); - free(dimids[1]); - continue; - } - } - - /* calculate read amount of this process in start[] and shape[] */ - for (j=0; j= nprocs) { - MPI_Offset dimLen = shape[j]; - shape[j] = dimLen / nprocs; - start[j] = shape[j] * rank; - if (rank < dimLen % nprocs) { - start[j] += rank; - shape[j]++; - } - else - start[j] += dimLen % nprocs; - break; - } - } - /* if none of shape[*] >= nprocs, then let all processes compare the - * whole variable - */ - - varsize = 1; - /* block partition the variable along the 1st dimension */ - for (j=0; j= 0) { - err = ncmpi_close(ncid[i]); - HANDLE_ERROR - } + MPI_Info_create(&info); + MPI_Info_set(info, "pnetcdf_subfiling", "disable"); + + numDIFF = ncmpidiff_core(argv[optind], argv[optind+1], + comm, info, verbose, quiet, check_header, + check_variable_list, check_entire_file, + num_vars, var_names, check_tolerance, + first_diff, cmd_opts, tolerance_difference, + tolerance_ratio); + + MPI_Info_free(&info); + + if (num_vars > 0) { + for (i=0; i +#endif + +#include +#include +#include +#include +#include /* stat() */ +#include /* stat() */ +#include /* stat() */ +#include /* errno */ +#include /* INFINITY */ + +#include +#include + +#include + +#ifndef ubyte +#define ubyte unsigned char +#endif +#ifndef ushort +#define ushort unsigned short +#endif +#ifndef uint +#define uint unsigned int +#endif +#ifndef int64 +#define int64 long long +#endif +#ifndef uint64 +#define uint64 unsigned long long +#endif + +#define PRINT_CMD_OPTS \ + if (first_diff && cmd_opts != NULL) { \ + printf("%s\n", cmd_opts); \ + 
first_diff = 0; \ + } + +#define OOM_ERROR { \ + fprintf(stderr, "Error: calloc() out of memory at line %d\n",__LINE__); \ + exit(1); \ +} + +#define HANDLE_ERROR { \ + if (err != NC_NOERR) { \ + fprintf(stderr, "Error at line %d of file %s (%s)\n", __LINE__, \ + __FILE__, ncmpi_strerror(err)); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + exit(-1); \ + } \ +} + +#define HANDLE_FILE_ERR(filename) { \ + if (err != NC_NOERR) { \ + fprintf(stderr, "Error at line %d: input file %s (%s)\n", __LINE__, \ + filename, ncmpi_strerror(err)); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + exit(-1); \ + } \ +} + +#define CHECK_GLOBAL_ATT_DIFF_CHAR { \ + int pos; \ + char *b1 = (char *)calloc((attlen[0] + 1) * 2, sizeof(char)); \ + char *b2 = b1 + attlen[0] + 1; \ + if (!b1) OOM_ERROR \ + err = ncmpi_get_att_text(ncid[0], NC_GLOBAL, name[0], b1); \ + HANDLE_ERROR \ + err = ncmpi_get_att_text(ncid[1], NC_GLOBAL, name[0], b2); \ + HANDLE_ERROR \ + for (pos=0; pos= 0) ? (x) : (-x) +#define UABS(x) (x) + +#define CHECK_VAR_DIFF(type, func, xabs) { \ + int pos, isDiff, worst = -1; \ + type *b1, *b2; \ + b1 = (type *)calloc(varsize * 2, sizeof(type)); \ + if (!b1) OOM_ERROR \ + b2 = b1 + varsize; \ + err = ncmpi_get_vara_##func(ncid[0], varid1, start, shape, b1); \ + HANDLE_ERROR \ + err = ncmpi_get_vara_##func(ncid[1], varid2, start, shape, b2); \ + HANDLE_ERROR \ + if (!check_tolerance) { \ + for (pos=0; pos abs_b2) ? abs_b1 : abs_b2; \ + diff = b1[pos] - b2[pos]; \ + diff = (diff >= 0) ? 
diff : -diff; \ + ratio = diff / abs_max; \ + if (diff <= tolerance_difference || ratio <= tolerance_ratio) \ + continue; \ + /* fail to meet both tolerance errors */ \ + worst = pos; \ + break; \ + } \ + } \ + if (pos != varsize || worst != -1) { /* diff is found */ \ + double v1, v2; \ + if (ndims[0] == 0) { /* scalar variable */ \ + PRINT_CMD_OPTS \ + if (worst == -1) \ + printf("DIFF: scalar variable \"%s\" of type \"%s\"\n", \ + name[0], get_type(xtype[0])); \ + else { \ + v1 = b1[worst]; \ + v2 = b2[worst]; \ + printf("DIFF (tolerance): scalar variable \"%s\" of type \"%s\" of value %g vs %g (difference = %e)\n", \ + name[0], get_type(xtype[0]), v1, v2, v1-v2); \ + } \ + } else { \ + int _i; \ + MPI_Offset *diffStart; \ + diffStart = (MPI_Offset*) malloc(sizeof(MPI_Offset) * ndims[0]); \ + if (worst != -1) pos = worst; \ + v1 = b1[pos]; \ + v2 = b2[pos]; \ + for (_i=ndims[0]-1; _i>=0; _i--) { \ + diffStart[_i] = pos % shape[_i] + start[_i]; \ + pos /= shape[_i]; \ + } \ + PRINT_CMD_OPTS \ + if (worst == -1) \ + printf("DIFF: variable \"%s\" of type \"%s\" at element ["OFFFMT, \ + name[0], get_type(xtype[0]), diffStart[0]); \ + else \ + printf("DIFF (tolerance): variable \"%s\" of type \"%s\" at element ["OFFFMT, \ + name[0], get_type(xtype[0]), diffStart[0]); \ + for (_i=1; _i----------------------------------------------------------*/ +static char* +get_type(int type) +{ + switch (type) { + case NC_BYTE: return "NC_BYTE"; + case NC_CHAR: return "NC_CHAR"; + case NC_SHORT: return "NC_SHORT"; + case NC_INT: return "NC_INT"; + case NC_FLOAT: return "NC_FLOAT"; + case NC_DOUBLE: return "NC_DOUBLE"; + case NC_UBYTE: return "NC_UBYTE"; + case NC_USHORT: return "NC_USHORT"; + case NC_UINT: return "NC_UINT"; + case NC_INT64: return "NC_INT64"; + case NC_UINT64: return "NC_UINT64"; + } + return "NC_NAT"; +} + +/*----< ncmpidiff_core() >---------------------------------------------------*/ +MPI_Offset ncmpidiff_core(const char *file1, + const char *file2, + MPI_Comm 
comm, + MPI_Info info, + int verbose, + int quiet, + int check_header, + int check_variable_list, + int check_entire_file, + int num_vars, + char **var_names, + int check_tolerance, + int first_diff, + char *cmd_opts, + double tolerance_difference, + double tolerance_ratio) +{ + char *name[2]; + int i, j, err, rank, nprocs; + int ncid[2], ndims[2], nvars[2], natts[2], recdim[2], *dimids[2], fmt[2]; + int cmp_nvars; + long long numVarDIFF=0, numHeadDIFF=0, varDIFF, numDIFF; + MPI_Offset *shape=NULL, varsize, *start=NULL; + MPI_Offset attlen[2], dimlen[2]; + nc_type xtype[2]; + + MPI_Comm_size(comm, &nprocs); + MPI_Comm_rank(comm, &rank); + + ncid[0] = ncid[1] = -1; + + if (verbose && rank == 0) { + printf("First file: %s\n", file1); + printf("Second file: %s\n", file2); + } + + /* compare file format */ + err = ncmpi_inq_file_format(file1, &fmt[0]); + HANDLE_FILE_ERR(file1) + err = ncmpi_inq_file_format(file2, &fmt[1]); + HANDLE_FILE_ERR(file2) + + if (fmt[0] != fmt[1]) { + if (!quiet && rank == 0) + printf("DIFF: file format (CDF-%d) != (CDF-%d)\n",fmt[0], fmt[1]); + numHeadDIFF++; + /* even formats are different, we continue to compare the contents + * of the files (headers and variables). 
+ */ + } + + /* open files and retrieve headers into memory buffers */ + name[0] = (char*) calloc(NC_MAX_NAME, 1); + if (!name[0]) OOM_ERROR + name[1] = (char*) calloc(NC_MAX_NAME, 1); + if (!name[1]) OOM_ERROR + + /* open both files: file1 as ncid[0], file2 as ncid[1] */ + err = ncmpi_open(comm, file1, NC_NOWRITE, info, &ncid[0]); + HANDLE_ERROR + err = ncmpi_open(comm, file2, NC_NOWRITE, info, &ncid[1]); + HANDLE_ERROR + + /* retrieve metadata */ + err = ncmpi_inq(ncid[0], &ndims[0], &nvars[0], &natts[0], &recdim[0]); + HANDLE_ERROR + err = ncmpi_inq(ncid[1], &ndims[1], &nvars[1], &natts[1], &recdim[1]); + HANDLE_ERROR + + /* compare file header */ + if (check_header && rank == 0) { /* only root checks header */ + int attnump; + + /* compare number of dimensions defined */ + if (ndims[0] != ndims[1]) { + if (!quiet) + printf("DIFF: number of dimensions (%d) != (%d)\n",ndims[0], ndims[1]); + numHeadDIFF++; + } + else if (verbose) + printf("SAME: number of dimensions (%d)\n",ndims[0]); + + /* compare number of variables defined */ + if (nvars[0] != nvars[1]) { + if (!quiet) + printf("DIFF: number of variables (%d) != (%d)\n",nvars[0], nvars[1]); + numHeadDIFF++; + } + else if (verbose) + printf("SAME: number of variables (%d)\n",nvars[0]); + + /* compare number of global attributes defined */ + if (natts[0] != natts[1]) { + if (!quiet) + printf("DIFF: number of global attributes (%d) != (%d)\n",natts[0], natts[1]); + numHeadDIFF++; + } + else if (verbose) + printf("SAME: number of global attributes (%d)\n",natts[0]); + + /* compare attributes defined in 1st file and also in 2nd file */ + for (i=0; i 0 && ndims[1] > 0) { + if (verbose) + printf("Dimension:\n"); + } else + goto cmp_vars; + + /* check dimensions in 1st file also appear in 2nd file */ + for (i=0; i 0 && nvars[1] > 0) { + if (verbose) + printf("Variables:\n"); + } else + goto cmp_exit; + + /* check variables defined in 1st file and also in 2nd file */ + for (i=0; i 0 && dimids[0][0] == recdim[0]) { /* record variable */ + err = 
ncmpi_inq_dimlen(ncid[0], recdim[0], &shape[0]); + HANDLE_ERROR + if (shape[0] == 0) { + /* No record has been written to the file, skip comparison */ + free(shape); + free(dimids[0]); + free(dimids[1]); + continue; + } + } + + /* calculate read amount of this process in start[] and shape[] */ + for (j=0; j= nprocs) { + MPI_Offset dimLen = shape[j]; + shape[j] = dimLen / nprocs; + start[j] = shape[j] * rank; + if (rank < dimLen % nprocs) { + start[j] += rank; + shape[j]++; + } + else + start[j] += dimLen % nprocs; + break; + } + } + /* if none of shape[*] >= nprocs, then let all processes compare the + * whole variable + */ + + varsize = 1; + /* block partition the variable along the 1st dimension */ + for (j=0; j= 0) { + err = ncmpi_close(ncid[i]); + HANDLE_ERROR + } + } + + /* summary of the difference */ + MPI_Reduce(&numVarDIFF, &varDIFF, 1, MPI_LONG_LONG_INT, MPI_SUM, 0, comm); + if (rank == 0 && !quiet) { + if (check_header) { + if (numHeadDIFF == 0) + printf("Headers of two files are the same\n"); + else + printf("Number of differences in header %lld\n",numHeadDIFF); + } + if (check_variable_list) { + if (varDIFF == 0) + printf("Compared variable(s) are the same\n"); + else + printf("Compared variables(s) has %lld differences\n",varDIFF); + } + if (check_entire_file) { + if (varDIFF == 0) + printf("All variables of two files are the same\n"); + else + printf("Number of differences in variables %lld\n",varDIFF); + } + } + + if (rank == 0) numDIFF = varDIFF + numHeadDIFF; + MPI_Bcast(&numDIFF, 1, MPI_LONG_LONG_INT, 0, comm); + + return numDIFF; +} diff --git a/src/utils/ncmpidiff/ncmpidiff_core.h b/src/utils/ncmpidiff/ncmpidiff_core.h new file mode 100644 index 0000000000..cb3d4dbfec --- /dev/null +++ b/src/utils/ncmpidiff/ncmpidiff_core.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include + +extern +MPI_Offset ncmpidiff_core(const char *file1, + const char *file2, + MPI_Comm comm, + MPI_Info info, + int verbose, + int quiet, + int check_header, + int check_variable_list, + int check_entire_file, + int num_vars, + char **var_names, + int check_tolerance, + int first_diff, + char *cmd_opts, + double tolerance_difference, + double tolerance_ratio); diff --git a/src/utils/ncmpidump/ncmpidump.c b/src/utils/ncmpidump/ncmpidump.c index be3d72482b..77829ef184 100644 --- a/src/utils/ncmpidump/ncmpidump.c +++ b/src/utils/ncmpidump/ncmpidump.c @@ -51,9 +51,9 @@ static void pr_att_string(size_t len, const char* string); static void pr_att_vals(nc_type type, size_t len, const double* vals); static void pr_att(int ncid, int varid, const char *varname, int ia); static void do_ncdump(const char* path, struct fspec* specp); -static void make_lvars(char* optarg, struct fspec* fspecp); -static void set_sigdigs( const char* optarg); -static void set_precision( const char *optarg); +static void make_lvars(char* opt_arg, struct fspec* fspecp); +static void set_sigdigs( const char* opt_arg); +static void set_precision( const char *opt_arg); int main(int argc, char** argv); #define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0) @@ -611,9 +611,9 @@ do_ncdump(const char *path, struct fspec* specp) static void -make_lvars(char *optarg, struct fspec* fspecp) +make_lvars(char *opt_arg, struct fspec* fspecp) { - char *cp = optarg; + char *cp = opt_arg; int nvars = 1; char ** cpp; @@ -628,7 +628,7 @@ make_lvars(char *optarg, struct fspec* fspecp) cpp = fspecp->lvars; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -647,15 +647,15 @@ make_lvars(char *optarg, struct fspec* fspecp) * command-line and update the default data formats appropriately. 
*/ static void -set_sigdigs(const char *optarg) +set_sigdigs(const char *opt_arg) { char *ptr1 = 0; char *ptr2 = 0; int flt_digits = FLT_DIGITS; /* default floating-point digits */ int dbl_digits = DBL_DIGITS; /* default double-precision digits */ - if (optarg != 0 && (int) strlen(optarg) > 0 && optarg[0] != ',') - flt_digits = (int)strtol(optarg, &ptr1, 10); + if (opt_arg != 0 && (int) strlen(opt_arg) > 0 && opt_arg[0] != ',') + flt_digits = (int)strtol(opt_arg, &ptr1, 10); if (flt_digits < 1 || flt_digits > 20) error("unreasonable value for float significant digits: %d", @@ -679,15 +679,15 @@ set_sigdigs(const char *optarg) * and update the default data formats appropriately. */ static void -set_precision(const char *optarg) +set_precision(const char *opt_arg) { char *ptr1 = 0; char *ptr2 = 0; int flt_digits = FLT_DIGITS; /* default floating-point digits */ int dbl_digits = DBL_DIGITS; /* default double-precision digits */ - if (optarg != 0 && (int) strlen(optarg) > 0 && optarg[0] != ',') { - flt_digits = (int)strtol(optarg, &ptr1, 10); + if (opt_arg != 0 && (int) strlen(opt_arg) > 0 && opt_arg[0] != ',') { + flt_digits = (int)strtol(opt_arg, &ptr1, 10); float_precision_specified = 1; } diff --git a/src/utils/ncmpidump/vardata.c b/src/utils/ncmpidump/vardata.c index 11310a9f91..b8cec8b995 100644 --- a/src/utils/ncmpidump/vardata.c +++ b/src/utils/ncmpidump/vardata.c @@ -82,9 +82,9 @@ static double double_eps; static float float_epsilon(void) { - float float_eps; + float val_float_eps; #ifndef NO_FLOAT_H - float_eps = FLT_EPSILON; + val_float_eps = FLT_EPSILON; #else /* NO_FLOAT_H */ { float etop, ebot, eps; @@ -103,19 +103,19 @@ float_epsilon(void) ebot = eps; eps = ebot + (etop - ebot)/two; } - float_eps = two * etop; + val_float_eps = two * etop; } #endif /* NO_FLOAT_H */ - return float_eps; + return val_float_eps; } static double double_epsilon(void) { - double double_eps; + double val_double_eps; #ifndef NO_FLOAT_H - double_eps = DBL_EPSILON; + val_double_eps 
= DBL_EPSILON; #else /* NO_FLOAT_H */ { double etop, ebot, eps; @@ -134,10 +134,10 @@ double_epsilon(void) ebot = eps; eps = ebot + (etop - ebot)/two; } - double_eps = two * etop; + val_double_eps = two * etop; } #endif /* NO_FLOAT_H */ - return double_eps; + return val_double_eps; } diff --git a/src/utils/ncmpigen/genlib.c b/src/utils/ncmpigen/genlib.c index 1e558a92fb..26ac82d890 100644 --- a/src/utils/ncmpigen/genlib.c +++ b/src/utils/ncmpigen/genlib.c @@ -1548,7 +1548,6 @@ cl_fortran(void) } fline(stmnt); if (v->type != NC_CHAR) { - char *sp; sprintf(stmnt, "%s %s(", ncftype(v->type), v->lname); /* reverse dimensions for FORTRAN */ @@ -1582,12 +1581,12 @@ cl_fortran(void) if (v->has_data) { fline(v->data_stmnt); } else { /* generate data statement for FILL record */ - MPI_Offset rec_len = 1; + MPI_Offset rec_size = 1; for (idim = 1; idim < v->ndims; idim++) { - rec_len *= dims[v->dims[idim]].size; + rec_size *= dims[v->dims[idim]].size; } sprintf(stmnt,"data %s /%lu * %s/", v->lname, - (unsigned long) rec_len, + (unsigned long) rec_size, f_fill_name(v->type)); fline(stmnt); } @@ -1695,9 +1694,9 @@ close_netcdf(void) void -check_err(int stat, const char *ncmpi_func, const char *calling_func, int lineno, const char *calling_file) { +check_err(int stat, const char *ncmpi_func, const char *calling_func, int linenum, const char *calling_file) { if (stat != NC_NOERR) { - fprintf(stderr, "ncmpigen error when calling %s in %s() at line %d of %s: %s\n", ncmpi_func, calling_func, lineno, calling_file, ncmpi_strerror(stat)); + fprintf(stderr, "ncmpigen error when calling %s in %s() at line %d of %s: %s\n", ncmpi_func, calling_func, linenum, calling_file, ncmpi_strerror(stat)); derror_count++; } } diff --git a/src/utils/ncmpigen/load.c b/src/utils/ncmpigen/load.c index 69fe54e3a9..788450aa26 100644 --- a/src/utils/ncmpigen/load.c +++ b/src/utils/ncmpigen/load.c @@ -394,7 +394,7 @@ fstrcat( */ static void f_var_init( - int varnum, /* which variable */ + int varid, /* which 
variable */ void *rec_start /* start of data */ ) { @@ -415,9 +415,9 @@ f_var_init( int ival; /* load variable with data values */ - sprintf(stmnt, "data %s /",vars[varnum].lname); + sprintf(stmnt, "data %s /",vars[varid].lname); stmnt_len = strlen(stmnt); - switch (vars[varnum].type) { + switch (vars[varid].type) { case NC_BYTE: charvalp = (char *) rec_start; for (ival = 0; ival < var_len-1; ival++) { @@ -524,10 +524,10 @@ f_var_init( /* For record variables, store data statement for later use; otherwise, just print it. */ - if (vars[varnum].ndims > 0 && vars[varnum].dims[0] == rec_dim) { + if (vars[varid].ndims > 0 && vars[varid].dims[0] == rec_dim) { char *dup_stmnt = (char*) emalloc(strlen(stmnt)+1); strcpy(dup_stmnt, stmnt); /* ULTRIX missing strdup */ - vars[varnum].data_stmnt = dup_stmnt; + vars[varid].data_stmnt = dup_stmnt; } else { fline(stmnt); } diff --git a/src/utils/ncmpigen/ncmpigentab.c b/src/utils/ncmpigen/ncmpigentab.c index 117e7d4945..0699867887 100644 --- a/src/utils/ncmpigen/ncmpigentab.c +++ b/src/utils/ncmpigen/ncmpigentab.c @@ -1,6 +1,8 @@ +/* #ifndef lint static const char yysccsid[] = "@(#)yaccpar 1.9 (Berkeley) 02/21/93"; #endif +*/ #include #include @@ -617,7 +619,6 @@ static int yygrowstack(void) #define YYABORT goto yyabort #define YYREJECT goto yyabort #define YYACCEPT goto yyaccept -#define YYERROR goto yyerrlab int yyparse(void) { @@ -686,11 +687,6 @@ yyparse(void) yyerror("syntax error"); -#ifdef lint - goto yyerrlab; -#endif - -yyerrlab: ++yynerrs; yyinrecovery: diff --git a/src/utils/ncoffsets/ncoffsets.c b/src/utils/ncoffsets/ncoffsets.c index 977f199dd2..59e62d0bc9 100644 --- a/src/utils/ncoffsets/ncoffsets.c +++ b/src/utils/ncoffsets/ncoffsets.c @@ -14,6 +14,7 @@ #include /* read() */ #include /* assert() */ #include /* check for Endianness, uint32_t*/ +#include /* uint32_t, uint64_t */ static int is_little_endian; @@ -225,38 +226,29 @@ static int check_little_endian(void) { (((a) & 0x00FF000000000000ULL) >> 40) | \ (((a) & 
0xFF00000000000000ULL) >> 56) ) -static void -swap4b(void *val) -{ - uint32_t *op = (uint32_t*)val; - *op = SWAP4B(*op); -} - -static void -swap8b(unsigned long long *val) -{ - uint64_t *op = (uint64_t*)val; - *op = SWAP8B(*op); -} static unsigned long long get_uint64(bufferinfo *gbp) { /* retrieve a 64bit unisgned integer and return it as unsigned long long */ - unsigned long long tmp; - memcpy(&tmp, gbp->pos, 8); - if (is_little_endian) swap8b(&tmp); - gbp->pos = (char*)gbp->pos + 8; - return tmp; + uint64_t tmp; + memcpy(&tmp, gbp->pos, sizeof(uint64_t)); + + if (is_little_endian) tmp = SWAP8B(tmp); + + gbp->pos = (char*)gbp->pos + sizeof(uint64_t); + return (unsigned long long)tmp; } static unsigned int get_uint32(bufferinfo *gbp) { /* retrieve a 32bit unisgned integer and return it as unsigned int */ - unsigned int tmp; - memcpy(&tmp, gbp->pos, 4); - if (is_little_endian) swap4b(&tmp); - gbp->pos = (char*)gbp->pos + 4; - return tmp; + uint32_t tmp; + memcpy(&tmp, gbp->pos, sizeof(uint32_t)); + + if (is_little_endian) tmp = SWAP4B(tmp); + + gbp->pos = (char*)gbp->pos + sizeof(uint32_t); + return (unsigned int)tmp; } static int @@ -471,8 +463,7 @@ ncmpii_NC_computeshapes(NC *ncp) ncp->recsize += (*vpp)->len; } else { - if (first_var == NULL) - first_var = *vpp; + if (first_var == NULL) first_var = *vpp; /* * Overwritten each time thru. * Usually overwritten in first_rec != NULL clause. 
@@ -1802,9 +1793,9 @@ struct fspec { }; static void -make_lvars(char *optarg, struct fspec* fspecp) +make_lvars(char *opt_arg, struct fspec* fspecp) { - char *cp = optarg; + char *cp = opt_arg; int nvars = 1; char ** cpp; @@ -1819,7 +1810,7 @@ make_lvars(char *optarg, struct fspec* fspecp) cpp = fspecp->lvars; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -1879,8 +1870,9 @@ static void usage(char *cmd) { char *help = -"Usage: %s [-h] | [-x] | [-sgr] [-v var1[,...]] file\n" +"Usage: %s [-h | -d | -x | -s | -g | -r | -v var1[,...]] file\n" " [-h] Print help\n" +" [-d] Enable debug mode (verbose output)\n" " [-v var1[,...]] Output for variable(s) ,... only\n" " [-s] Output variable size. For record variables, output\n" " the size of one record only\n" @@ -1898,7 +1890,7 @@ int main(int argc, char *argv[]) { extern int optind; extern char *optarg; - char *filename, *env_str; + char *filename; int i, j, err, opt; int print_var_size=0, print_gap=0, check_gap=0, print_all_rec=0; NC *ncp; @@ -1906,8 +1898,10 @@ int main(int argc, char *argv[]) fspecp = (struct fspec*) calloc(1, sizeof(struct fspec)); + verbose_debug = 0; + /* get command-line arguments */ - while ((opt = getopt(argc, argv, "v:sghqxr")) != EOF) { + while ((opt = getopt(argc, argv, "v:dsghqxr")) != EOF) { switch(opt) { case 'v': make_lvars(optarg, fspecp); break; @@ -1919,6 +1913,8 @@ int main(int argc, char *argv[]) break; case 'x': check_gap = 1; break; + case 'd': verbose_debug = 1; + break; case 'h': default: usage(argv[0]); free(fspecp); @@ -1937,10 +1933,6 @@ int main(int argc, char *argv[]) } filename = argv[optind]; /* required argument */ - verbose_debug = 0; - env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); - if (env_str != NULL && *env_str != '0') verbose_debug = 1; - /* find Endianness of the running machine */ is_little_endian = check_little_endian(); @@ -2070,7 +2062,7 @@ int main(int 
argc, char *argv[]) /* print fixed-size variables first */ if (num_fix_vars) printf("\nfixed-size variables:\n"); for (i=0; inlvars; i++) { - int j, ndims, cdots; + int ndims, cdots; char type_str[16], str[1024], *line; size_t lineLen; long long size; @@ -2162,7 +2154,7 @@ int main(int argc, char *argv[]) /* print record variables */ if (num_rec_vars) printf("\nrecord variables:\n"); for (i=0; inlvars; i++) { - int j, ndims, cdots; + int ndims, cdots; char type_str[16], str[1024], *line; size_t lineLen; long long var_begin, var_end, size, numrecs; diff --git a/src/utils/ncvalidator/ncvalidator.c b/src/utils/ncvalidator/ncvalidator.c index da58bcf6c4..be06e14a60 100644 --- a/src/utils/ncvalidator/ncvalidator.c +++ b/src/utils/ncvalidator/ncvalidator.c @@ -14,6 +14,7 @@ #include /* check for Endianness, uint32_t*/ #include #include /* errno */ +#include /* uint32_t, uint64_t */ #define X_ALIGN 4 #define X_INT_MAX 2147483647 @@ -230,40 +231,30 @@ static int check_little_endian(void) return (*((uint8_t*)(&i))) == 0x67; } -static void -swap4b(void *val) -{ - uint32_t *op = (uint32_t*)val; - *op = SWAP4B(*op); -} - -static void -swap8b(unsigned long long *val) -{ - uint64_t *op = (uint64_t*)val; - *op = SWAP8B(*op); -} - static unsigned long long get_uint64(bufferinfo *gbp) { /* retrieve a 64bit unsigned integer and return it as unsigned long long */ - unsigned long long tmp; - memcpy(&tmp, gbp->pos, 8); - if (gbp->is_little_endian) swap8b(&tmp); - gbp->pos = (char*)gbp->pos + 8; /* advance gbp->pos 8 bytes */ - return tmp; + uint64_t tmp; + memcpy(&tmp, gbp->pos, sizeof(uint64_t)); + + if (gbp->is_little_endian) tmp = SWAP8B(tmp); + + gbp->pos = (char*)gbp->pos + sizeof(uint64_t); /* advance gbp->pos 8 bytes */ + return (unsigned long long)tmp; } static unsigned int get_uint32(bufferinfo *gbp) { /* retrieve a 32bit unsigned integer and return it as unsigned int */ - unsigned int tmp; - memcpy(&tmp, gbp->pos, 4); - if (gbp->is_little_endian) swap4b(&tmp); - gbp->pos = 
(char*)gbp->pos + 4; /* advance gbp->pos 4 bytes */ - return tmp; + uint32_t tmp; + memcpy(&tmp, gbp->pos, sizeof(uint32_t)); + + if (gbp->is_little_endian) tmp = SWAP4B(tmp); + + gbp->pos = (char*)gbp->pos + sizeof(uint32_t); /* advance gbp->pos 4 bytes */ + return (unsigned int)tmp; } static void @@ -1397,7 +1388,7 @@ val_get_NC_attr(int fd, NC_attr **attrpp, const char *loc) { - char *name=NULL, xloc[1024]; + char *name=NULL, xloc[2048]; int err, status=NC_NOERR; size_t err_addr, name_len; nc_type xtype; @@ -1901,8 +1892,8 @@ val_NC_check_vlens(NC *ncp) and format 2. */ long long ii, vlen_max, rec_vars_count; long long large_fix_vars_count, large_rec_vars_count; - long long first_large_fix_var, first_large_rec_var; - long long second_large_fix_var, second_large_rec_var; + long long first_large_fix_var=0, first_large_rec_var=0; + long long second_large_fix_var=0, second_large_rec_var=0; int last = 0; if (ncp->vars.ndefined == 0) @@ -2401,7 +2392,7 @@ val_get_NC(int fd, NC *ncp) /* check zero padding in the blank space betwee header size and extent */ if (repair && ncp->begin_var - ncp->xsz > 0) { - size_t i, gap = ncp->begin_var - ncp->xsz; + size_t gap = ncp->begin_var - ncp->xsz; ssize_t readLen; char *buf = (char*) malloc(gap); @@ -2448,7 +2439,7 @@ val_get_NC(int fd, NC *ncp) #ifndef BUILD_CDFDIFF -/* File system types recognized by ROMIO in MPICH 4.0.0 */ +/* File system types recognized by ROMIO in MPICH 4.0.0, and by PnetCDF */ static const char* fstypes[] = {"ufs", "nfs", "xfs", "pvfs2", "gpfs", "panfs", "lustre", "daos", "testfs", "ime", "quobyte", NULL}; /* Return a pointer to filename by removing the file system type prefix name if diff --git a/test/C/Makefile.am b/test/C/Makefile.am index 4d0668b6fd..0a24235a86 100644 --- a/test/C/Makefile.am +++ b/test/C/Makefile.am @@ -13,21 +13,17 @@ AM_DEFAULT_SOURCE_EXT = .c AM_CPPFLAGS = -I$(top_srcdir)/src/include AM_CPPFLAGS += -I$(srcdir)/../common AM_CPPFLAGS += -I$(top_builddir)/src/include +AM_CPPFLAGS += 
-I$(top_srcdir)/src/utils/ncmpidiff LDADD = $(top_builddir)/src/libs/libpnetcdf.la ../common/libtestutils.la +LDADD += $(top_builddir)/src/utils/ncmpidiff/libncmpidiff_core.la LDADD += @NETCDF4_LDFLAGS@ @ADIOS_LDFLAGS@ @NETCDF4_LIBS@ @ADIOS_LIBS@ if DECL_MPI_OFFSET AM_CPPFLAGS += -DHAVE_DECL_MPI_OFFSET endif -TESTPROGRAMS = pres_temp_4D_wr \ - pres_temp_4D_rd - -pres_temp_4D_wr_SOURCES = pres_temp_4D_wr.c pres_temp_4D.h -pres_temp_4D_rd_SOURCES = pres_temp_4D_rd.c pres_temp_4D.h - -check_PROGRAMS = $(TESTPROGRAMS) +check_PROGRAMS = pres_temp_4D_wr_rd # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead @@ -36,34 +32,38 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export ENABLE_NETCDF4="$(ENABLE_NETCDF4)"; -TESTS = seq_runs.sh -TEST_EXTENSIONS = .sh -# LOG_COMPILER = $(srcdir)/wrap_runs.sh -# SH_LOG_COMPILER = +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_NETCDF4 + TESTS_ENVIRONMENT += export ENABLE_NETCDF4=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif -# Dependency: pres_temp_4D_rd reads the output file from pres_temp_4D_wr. 
-# To support parallel "make -jN check", we add dependencies below and automake -# states it currently works only for tests that end in one of the suffixes -# listed in TEST_EXTENSIONS. $(EXEEXT) are appended to work around. -pres_temp_4D_rd$(EXEEXT).log: pres_temp_4D_wr$(EXEEXT).log +TESTS = $(check_PROGRAMS) +TEST_EXTENSIONS = .sh +LOG_COMPILER = $(srcdir)/seq_runs.sh +SH_LOG_COMPILER = EXTRA_DIST = seq_runs.sh parallel_run.sh -CLEANFILES = $(TESTOUTDIR)/pres_temp_4D.nc \ - $(TESTOUTDIR)/pres_temp_4D.nc4 \ - $(TESTOUTDIR)/pres_temp_4D.bb.nc \ +CLEANFILES = $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ core core.* *.gcda *.gcno *.gcov gmon.out ../common/libtestutils.la: set -e; cd ../common && $(MAKE) $(MFLAGS) tests -ptest ptests ptest4: $(TESTPROGRAMS) +ptest ptests ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" diff --git a/test/C/parallel_run.sh b/test/C/parallel_run.sh index 9fe1f41a8a..5cf76c787a 100755 --- a/test/C/parallel_run.sh +++ b/test/C/parallel_run.sh @@ -1,23 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" +# echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,48 +33,33 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -for j in ${safe_modes} ; do -for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.nc - ${MPIRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.nc - # echo "--- validating file ${TESTOUTDIR}/pres_temp_4D.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/pres_temp_4D.nc - # echo "" - - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.bb.nc - ${MPIRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.bb.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} - - # echo "--- validating file ${TESTOUTDIR}/pres_temp_4D.bb.nc" - ${TESTSEQRUN} 
${VALIDATOR} -q ${TESTOUTDIR}/pres_temp_4D.bb.nc - - # echo "--- ncmpidiff pres_temp_4D.nc pres_temp_4D.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/pres_temp_4D.nc ${TESTOUTDIR}/pres_temp_4D.bb.nc - fi - - if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.nc4 4 - ${MPIRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.nc4 - # Validator does not support nc4 - fi -done -done - -rm -f ${OUTDIR}/*.nc -rm -f ${OUTDIR}/*.nc4 +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + +for i in ${check_PROGRAMS} ; do + + # SECONDS=0 + # start_ns=$(date +%s.%4N) + + exe_name=`basename $i` + + for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc + + done # safe_modes + + # # echo "Elapsed: $SECONDS seconds" + # end_ns=$(date +%s.%4N) + # # Calculate difference (requires bc for floating point math) + # elapsed_ns=$(echo "$end_ns - $start_ns" | bc) + # echo "Elapsed time: ${elapsed_ns} seconds" + +done # check_PROGRAMS diff --git a/test/C/pres_temp_4D.h b/test/C/pres_temp_4D.h deleted file mode 100644 index f70677c069..0000000000 --- a/test/C/pres_temp_4D.h +++ /dev/null @@ -1,37 +0,0 @@ -/* We are writing and reading 4D data, a 2 x 6 x 12 lvl-lat-lon grid, with 2 - * timesteps of data. */ -#define NDIMS 4 -#define NLAT 6 -#define NLON 12 -#define LAT_NAME "latitude" -#define LON_NAME "longitude" -#define NREC 2 -#define REC_NAME "time" -#define LVL_NAME "level" -#define NLVL 10 - -/* Names of things. */ -#define PRES_NAME "pressure" -#define TEMP_NAME "temperature" -#define UNITS "units" -#define DEGREES_EAST "degrees_east" -#define DEGREES_NORTH "degrees_north" - -/* These are used to construct some example data, and to calculate the values - * we expect to find. 
*/ -#define SAMPLE_PRESSURE 900.0 -#define SAMPLE_TEMP 9.0 -#define START_LAT 25.0 -#define START_LON -125.0 - -/* For the units attributes. */ -#define UNITS "units" -#define PRES_UNITS "hPa" -#define TEMP_UNITS "celsius" -#define LAT_UNITS "degrees_north" -#define LON_UNITS "degrees_east" -#define MAX_ATT_LEN 80 - -/* This is the name of the data file we will create and read back. */ -#define FILE_NAME "pres_temp_4D.nc" - diff --git a/test/C/pres_temp_4D_rd.c b/test/C/pres_temp_4D_rd.c deleted file mode 100644 index 84257c3b09..0000000000 --- a/test/C/pres_temp_4D_rd.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (C) 2003, Northwestern University and Argonne National Laboratory - * See COPYRIGHT notice in top-level directory. - */ -/* $Id$ */ - -/* - This is an example which reads some 4D pressure and - temperatures. The data file read by this program is produced by the - companion program pres_temp_4D_wr.c. It is intended to illustrate - the use of the netCDF C API. - - This program is part of the netCDF tutorial: - http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-tutorial - - Full documentation of the netCDF C API can be found at: - http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c - - $Id$ -*/ - -#include -#include -#include -#include /* basename() */ -#include -#include -#include - -#include "pres_temp_4D.h" - -int main(int argc, char **argv) -{ - int rank, nprocs, ncid, pres_varid, temp_varid; - int lat_varid, lon_varid; - - /* The start and count arrays will tell the netCDF library where to - read our data. */ - MPI_Offset start[NDIMS], count[NDIMS]; - - /* Program variables to hold the data we will read. We will only - need enough space to hold one timestep of data; one record. */ - float **pres_in = NULL; /* [NLVL/nprocs][NLAT][NLON] */ - float **temp_in = NULL; /* [NLVL/nprocs][NLAT][NLON] */ - - /* These program variables hold the latitudes and longitudes. */ - float lats[NLAT], lons[NLON]; - - /* Loop indexes. 
*/ - int lvl, lat, lon, rec, i = 0; - - /* Error handling. */ - int err, nerrs = 0; - - char *filename = FILE_NAME; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc > 3) { - if (!rank) - printf("Usage: %s [filename]\n", argv[0]); - MPI_Finalize(); - return 1; - } - - if (argc > 1) filename = argv[1]; - - /* Open the file. */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); - if (err != NC_NOERR) { /* fatal error */ - if (rank == 0) - fprintf(stderr,"Error: failed to open file %s (%s)\n",filename,ncmpi_strerror(err)); - goto err_out; - } - - if (rank == 0) { - char *cmd_str = (char *)malloc(strlen(argv[0]) + 256); - int format; - err = ncmpi_inq_format(ncid, &format); CHECK_ERR - if (format == NC_FORMAT_NETCDF4) - sprintf(cmd_str, "*** TESTING C %s for reading NetCDF-4 file", basename(argv[0])); - else - sprintf(cmd_str, "*** TESTING C %s for reading classic file", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } - - /* Get the varids of the latitude and longitude coordinate variables. */ - err = ncmpi_inq_varid(ncid, LAT_NAME, &lat_varid); - CHECK_ERR - err = ncmpi_inq_varid(ncid, LON_NAME, &lon_varid); - CHECK_ERR - - /* Read the coordinate variable data. */ - memset(lats, 0, sizeof(float) * NLAT); - memset(lons, 0, sizeof(float) * NLON); - err = ncmpi_get_var_float_all(ncid, lat_varid, &lats[0]); - CHECK_ERR - err = ncmpi_get_var_float_all(ncid, lon_varid, &lons[0]); - CHECK_ERR - - /* Check the coordinate variable data. */ - for (lat = 0; lat < NLAT; lat++) { - float exp = START_LAT + 5. * lat; - if (lats[lat] != exp) { - printf("\nError at line %d in %s: lats[%d] expect %.1f but got %.1f\n", - __LINE__, __FILE__, lat, exp, lats[lat]); - nerrs++; - break; - } - } - for (lon = 0; lon < NLON; lon++) { - float exp = START_LON + 5. * lon; - if (lons[lon] != START_LON + 5. 
* lon) { - printf("\nError at line %d in %s: lons[%d] expect %.1f but got %.1f\n", - __LINE__, __FILE__, lon, exp, lons[lon]); - nerrs++; - break; - } - } - - /* Get the varids of the pressure and temperature netCDF variables. */ - err = ncmpi_inq_varid(ncid, PRES_NAME, &pres_varid); - CHECK_ERR - err = ncmpi_inq_varid(ncid, TEMP_NAME, &temp_varid); - CHECK_ERR - - /* Read the data. Since we know the contents of the file we know that the - * data arrays in this program are the correct size to hold one timestep. - */ - count[0] = 1; - count[2] = NLAT; - count[3] = NLON; - start[2] = 0; - start[3] = 0; - - /* divide NLVL dimension among processes */ - count[1] = NLVL / nprocs; - start[1] = count[1] * rank; - if (rank < NLVL % nprocs) { - start[1] += rank; - count[1]++; - } - else { - start[1] += NLVL % nprocs; - } - if (count[1] == 0) - start[1] = 0; - - /* allocate read buffers */ - pres_in = (float **)malloc(sizeof(float *) * count[1] * 2); - temp_in = pres_in + count[1]; - if (count[1] > 0) { - pres_in[0] = (float *)malloc(sizeof(float) * count[1] * 2 * NLAT * NLON); - temp_in[0] = pres_in[0] + count[1] * NLAT * NLON; - for (i = 1; i < count[1]; i++) { - pres_in[i] = pres_in[i - 1] + NLAT * NLON; - temp_in[i] = temp_in[i - 1] + NLAT * NLON; - } - } - - /* Read and check one record at a time. */ - for (rec = 0; rec < NREC; rec++) { - start[0] = rec; - err = ncmpi_get_vara_float_all(ncid, pres_varid, start, count, &pres_in[0][0]); - CHECK_ERR - err = ncmpi_get_vara_float_all(ncid, temp_varid, start, count, &temp_in[0][0]); - CHECK_ERR - - /* Check the data. 
*/ - i = (int)start[1] * NLAT * NLON; - for (lvl = 0; lvl < count[1]; lvl++) - for (lat = 0; lat < NLAT; lat++) - for (lon = 0; lon < NLON; lon++) { - float exp = SAMPLE_PRESSURE + i; - int indx = lat * NLON + lon; - if (pres_in[lvl][indx] != exp) { - printf("\nError at line %d in %s: %s[%d][%d][%d][%d] expect %.1f but got %.1f\n", - __LINE__, __FILE__, PRES_NAME, rec, lvl, lat, lon, exp, pres_in[lvl][indx]); - nerrs++; - goto fn_exit; - } - exp = SAMPLE_TEMP + i; - if (temp_in[lvl][indx] != exp) { - printf("\nError at line %d in %s: %s[%d][%d][%d][%d] expect %.1f but got %.1f\n", - __LINE__, __FILE__, TEMP_NAME, rec, lvl, lat, lon, exp, temp_in[lvl][indx]); - nerrs++; - goto fn_exit; - } - i++; - } - } /* next record */ - -fn_exit: - /* Close the file. */ - err = ncmpi_close(ncid); - CHECK_ERR - - if (pres_in != NULL) { - if (pres_in[0] != NULL) - free(pres_in[0]); - free(pres_in); - } - - /* check if there is any malloc residue */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - -err_out: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) - printf(FAIL_STR, nerrs); - else - printf(PASS_STR); - } - - MPI_Finalize(); - return (nerrs > 0); -} diff --git a/test/C/pres_temp_4D_wr.c b/test/C/pres_temp_4D_wr.c deleted file mode 100644 index 8039d0dcdb..0000000000 --- a/test/C/pres_temp_4D_wr.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (C) 2003, Northwestern University and Argonne National Laboratory - * See COPYRIGHT notice in top-level directory. - */ -/* $Id$ */ - -/* - This is an example program which writes some 4D pressure and - temperatures. 
It is intended to illustrate the use of the netCDF - C API. The companion program pres_temp_4D_rd.c shows how - to read the netCDF data file created by this program. - - This program is part of the netCDF tutorial: - http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-tutorial - - Full documentation of the netCDF C API can be found at: - http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c - - $Id$ -*/ - -#include -#include -#include -#include /* basename() */ -#include -#include -#include - -#include "pres_temp_4D.h" - -int main(int argc, char **argv) -{ - /* IDs for the netCDF file, dimensions, and variables. */ - int nprocs, rank, nerrs = 0; - int ncid; - int lon_dimid, lat_dimid, lvl_dimid, rec_dimid; - int lat_varid, lon_varid, pres_varid, temp_varid; - int dimids[NDIMS]; - int format = NC_FORMAT_CLASSIC; - - /* The start and count arrays will tell the netCDF library where to - write our data. */ - MPI_Offset start[NDIMS], count[NDIMS]; - - /* Program variables to hold the data we will write out. We will only - need enough space to hold one timestep of data; one record. */ - float **pres_out; /* [NLVL/nprocs][NLAT][NLON] */ - float **temp_out; /* [NLVL/nprocs][NLAT][NLON] */ - - /* These program variables hold the latitudes and longitudes. */ - float lats[NLAT], lons[NLON]; - - /* Loop indexes. */ - int lvl, lat, lon, rec, i = 0; - - /* Error handling. 
*/ - int err; - - char *filename = FILE_NAME; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc > 3) { - if (!rank) - printf("Usage: %s [filename]\n", argv[0]); - MPI_Finalize(); - return 1; - } - - if (argc > 1) filename = argv[1]; - - if (argc > 2 && atoi(argv[2]) == 4) - format = NC_FORMAT_NETCDF4; - - if (rank == 0) { - char *cmd_str = (char *)malloc(strlen(argv[0]) + 256); - if (format == NC_FORMAT_NETCDF4) - sprintf(cmd_str, "*** TESTING C %s for writing NetCDF-4 file", basename(argv[0])); - else - sprintf(cmd_str, "*** TESTING C %s for writing classic file", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } - - /* Create some pretend data. If this wasn't an example program, we - * would have some real data to write, for example, model - * output. - */ - for (lat = 0; lat < NLAT; lat++) - lats[lat] = START_LAT + 5. * lat; - for (lon = 0; lon < NLON; lon++) - lons[lon] = START_LON + 5. * lon; - - /* Set format. */ - err = ncmpi_set_default_format(format, NULL); - CHECK_ERR - - /* Create the file. */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); - CHECK_ERR - - /* Define the dimensions. The record dimension is defined to have - * unlimited length - it can grow as needed. In this example it is - * the time dimension.*/ - err = ncmpi_def_dim(ncid, LVL_NAME, NLVL, &lvl_dimid); - CHECK_ERR - err = ncmpi_def_dim(ncid, LAT_NAME, NLAT, &lat_dimid); - CHECK_ERR - err = ncmpi_def_dim(ncid, LON_NAME, NLON, &lon_dimid); - CHECK_ERR - err = ncmpi_def_dim(ncid, REC_NAME, NC_UNLIMITED, &rec_dimid); - CHECK_ERR - - /* Define the coordinate variables. We will only define coordinate - variables for lat and lon. 
Ordinarily we would need to provide - an array of dimension IDs for each variable's dimensions, but - since coordinate variables only have one dimension, we can - simply provide the address of that dimension ID (&lat_dimid) and - similarly for (&lon_dimid). */ - err = ncmpi_def_var(ncid, LAT_NAME, NC_FLOAT, 1, &lat_dimid, &lat_varid); - CHECK_ERR - err = ncmpi_def_var(ncid, LON_NAME, NC_FLOAT, 1, &lon_dimid, &lon_varid); - CHECK_ERR - - /* Assign units attributes to coordinate variables. */ - err = ncmpi_put_att_text(ncid, lat_varid, UNITS, - strlen(DEGREES_NORTH), DEGREES_NORTH); - CHECK_ERR - err = ncmpi_put_att_text(ncid, lon_varid, UNITS, - strlen(DEGREES_EAST), DEGREES_EAST); - CHECK_ERR - - /* The dimids array is used to pass the dimids of the dimensions of - the netCDF variables. Both of the netCDF variables we are - creating share the same four dimensions. In C, the - unlimited dimension must come first on the list of dimids. */ - dimids[0] = rec_dimid; - dimids[1] = lvl_dimid; - dimids[2] = lat_dimid; - dimids[3] = lon_dimid; - - /* Define the netCDF variables for the pressure and temperature - * data. */ - err = ncmpi_def_var(ncid, PRES_NAME, NC_FLOAT, NDIMS, dimids, &pres_varid); - CHECK_ERR - err = ncmpi_def_var(ncid, TEMP_NAME, NC_FLOAT, NDIMS, dimids, &temp_varid); - CHECK_ERR - - /* Assign units attributes to the netCDF variables. */ - err = ncmpi_put_att_text(ncid, pres_varid, UNITS, - strlen(PRES_UNITS), PRES_UNITS); - CHECK_ERR - err = ncmpi_put_att_text(ncid, temp_varid, UNITS, - strlen(TEMP_UNITS), TEMP_UNITS); - CHECK_ERR - - /* End define mode. */ - err = ncmpi_enddef(ncid); - CHECK_ERR - - err = ncmpi_begin_indep_data(ncid); - /* Write the coordinate variable data. This will put the latitudes - and longitudes of our data grid into the netCDF file. 
*/ - if (rank == 0) { - err = ncmpi_put_var_float(ncid, lat_varid, &lats[0]); - CHECK_ERR - err = ncmpi_put_var_float(ncid, lon_varid, &lons[0]); - CHECK_ERR - } - err = ncmpi_end_indep_data(ncid); - CHECK_ERR - - /* These settings tell netcdf to write one timestep of data. (The - setting of start[0] inside the loop below tells netCDF which - &data[0][0][0]); - timestep to write.) */ - count[0] = 1; - count[2] = NLAT; - count[3] = NLON; - start[2] = 0; - start[3] = 0; - - /* divide NLVL dimension among processes */ - count[1] = NLVL / nprocs; - start[1] = count[1] * rank; - if (rank < NLVL % nprocs) { - start[1] += rank; - count[1]++; - } - else { - start[1] += NLVL % nprocs; - } - if (count[1] == 0) - start[1] = 0; - - /* allocate write buffers */ - pres_out = (float **)malloc(sizeof(float *) * count[1] * 2); - temp_out = pres_out + count[1]; - if (count[1] > 0) { - pres_out[0] = (float *)malloc(sizeof(float) * count[1] * 2 * NLAT * NLON); - temp_out[0] = pres_out[0] + count[1] * NLAT * NLON; - for (i = 1; i < count[1]; i++) { - pres_out[i] = pres_out[i - 1] + NLAT * NLON; - temp_out[i] = temp_out[i - 1] + NLAT * NLON; - } - } - - /* initialize write buffers */ - i = (int)start[1] * NLAT * NLON; - for (lvl = 0; lvl < count[1]; lvl++) - for (lat = 0; lat < NLAT; lat++) - for (lon = 0; lon < NLON; lon++) { - pres_out[lvl][lat * NLON + lon] = SAMPLE_PRESSURE + i; - temp_out[lvl][lat * NLON + lon] = SAMPLE_TEMP + i++; - } - - /* Write the pretend data. This will write our surface pressure and - surface temperature data. The arrays only hold one timestep worth - of data. We will just rewrite the same data for each timestep. In - a real application, the data would change between timesteps. */ - - for (rec = 0; rec < NREC; rec++) { - start[0] = rec; - err = ncmpi_put_vara_float_all(ncid, pres_varid, start, count, &pres_out[0][0]); - CHECK_ERR - err = ncmpi_put_vara_float_all(ncid, temp_varid, start, count, &temp_out[0][0]); - CHECK_ERR - } - - /* Close the file. 
*/ - err = ncmpi_close(ncid); - CHECK_ERR - - if (count[1] > 0) - free(pres_out[0]); - free(pres_out); - - /* check if there is any malloc residue */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) - printf(FAIL_STR, nerrs); - else - printf(PASS_STR); - } - - MPI_Finalize(); - - return (nerrs > 0); -} diff --git a/test/C/pres_temp_4D_wr_rd.c b/test/C/pres_temp_4D_wr_rd.c new file mode 100644 index 0000000000..1433046916 --- /dev/null +++ b/test/C/pres_temp_4D_wr_rd.c @@ -0,0 +1,486 @@ +/* + * Copyright (C) 2003, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +/* + This is an example program which writes some 4D pressure and + temperatures. It is intended to illustrate the use of the netCDF + C API. The companion program pres_temp_4D_rd.c shows how + to read the netCDF data file created by this program. + + This program is part of the netCDF tutorial: + http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-tutorial + + Full documentation of the netCDF C API can be found at: + http://www.unidata.ucar.edu/software/netcdf/docs/netcdf-c +*/ + +#include +#include +#include + +#include +#include +#include + +/* We are writing and reading 4D data, a 2 x 6 x 12 lvl-lat-lon grid, with 2 + * timesteps of data. */ +#define NDIMS 4 +#define NLAT 6 +#define NLON 12 +#define LAT_NAME "latitude" +#define LON_NAME "longitude" +#define NREC 2 +#define REC_NAME "time" +#define LVL_NAME "level" +#define NLVL 512 + +/* Names of things. 
*/ +#define PRES_NAME "pressure" +#define TEMP_NAME "temperature" +#define UNITS "units" +#define DEGREES_EAST "degrees_east" +#define DEGREES_NORTH "degrees_north" + +/* These are used to construct some example data, and to calculate the values + * we expect to find. */ +#define SAMPLE_PRESSURE 900.0 +#define SAMPLE_TEMP 9.0 +#define START_LAT 25.0 +#define START_LON -125.0 + +/* For the units attributes. */ +#define UNITS "units" +#define PRES_UNITS "hPa" +#define TEMP_UNITS "celsius" +#define LAT_UNITS "degrees_north" +#define LON_UNITS "degrees_east" +#define MAX_ATT_LEN 80 + +/* This is the name of the data file we will create and read back. */ +#define FILE_NAME "pres_temp_4D.nc" + + +/*----< pres_temp_4D_wr() >--------------------------------------------------*/ +int pres_temp_4D_wr(const char *filename, + int format, + int coll_io, + MPI_Info info) +{ + /* IDs for the netCDF file, dimensions, and variables. */ + int nprocs, rank, nerrs = 0, ncid; + int lon_dimid, lat_dimid, lvl_dimid, rec_dimid; + int lat_varid, lon_varid, pres_varid, temp_varid; + int dimids[NDIMS]; + + /* The start and count arrays will tell the netCDF library where to + write our data. */ + MPI_Offset start[NDIMS], count[NDIMS]; + + /* Program variables to hold the data we will write out. We will only + need enough space to hold one timestep of data; one record. */ + float **pres_out; /* [NLVL/nprocs][NLAT][NLON] */ + float **temp_out; /* [NLVL/nprocs][NLAT][NLON] */ + + /* These program variables hold the latitudes and longitudes. */ + float lats[NLAT], lons[NLON]; + + /* Loop indexes. */ + int lvl, lat, lon, rec, i = 0; + + /* Error handling. */ + int err; + + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + /* Create some pretend data. If this wasn't an example program, we + * would have some real data to write, for example, model + * output. + */ + for (lat = 0; lat < NLAT; lat++) + lats[lat] = START_LAT + 5. 
* lat; + for (lon = 0; lon < NLON; lon++) + lons[lon] = START_LON + 5. * lon; + + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + /* Create the file. */ + err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); + CHECK_ERR + + /* Define the dimensions. The record dimension is defined to have + * unlimited length - it can grow as needed. In this example it is + * the time dimension.*/ + err = ncmpi_def_dim(ncid, LVL_NAME, NLVL, &lvl_dimid); + CHECK_ERR + err = ncmpi_def_dim(ncid, LAT_NAME, NLAT, &lat_dimid); + CHECK_ERR + err = ncmpi_def_dim(ncid, LON_NAME, NLON, &lon_dimid); + CHECK_ERR + err = ncmpi_def_dim(ncid, REC_NAME, NC_UNLIMITED, &rec_dimid); + CHECK_ERR + + /* Define the coordinate variables. We will only define coordinate + variables for lat and lon. Ordinarily we would need to provide + an array of dimension IDs for each variable's dimensions, but + since coordinate variables only have one dimension, we can + simply provide the address of that dimension ID (&lat_dimid) and + similarly for (&lon_dimid). */ + err = ncmpi_def_var(ncid, LAT_NAME, NC_FLOAT, 1, &lat_dimid, &lat_varid); + CHECK_ERR + err = ncmpi_def_var(ncid, LON_NAME, NC_FLOAT, 1, &lon_dimid, &lon_varid); + CHECK_ERR + + /* Assign units attributes to coordinate variables. */ + err = ncmpi_put_att_text(ncid, lat_varid, UNITS, + strlen(DEGREES_NORTH), DEGREES_NORTH); + CHECK_ERR + err = ncmpi_put_att_text(ncid, lon_varid, UNITS, + strlen(DEGREES_EAST), DEGREES_EAST); + CHECK_ERR + + /* The dimids array is used to pass the dimids of the dimensions of + the netCDF variables. Both of the netCDF variables we are + creating share the same four dimensions. In C, the + unlimited dimension must come first on the list of dimids. */ + dimids[0] = rec_dimid; + dimids[1] = lvl_dimid; + dimids[2] = lat_dimid; + dimids[3] = lon_dimid; + + /* Define the netCDF variables for the pressure and temperature + * data. 
*/
+    err = ncmpi_def_var(ncid, PRES_NAME, NC_FLOAT, NDIMS, dimids, &pres_varid);
+    CHECK_ERR
+    err = ncmpi_def_var(ncid, TEMP_NAME, NC_FLOAT, NDIMS, dimids, &temp_varid);
+    CHECK_ERR
+
+    /* Assign units attributes to the netCDF variables. */
+    err = ncmpi_put_att_text(ncid, pres_varid, UNITS,
+                             strlen(PRES_UNITS), PRES_UNITS);
+    CHECK_ERR
+    err = ncmpi_put_att_text(ncid, temp_varid, UNITS,
+                             strlen(TEMP_UNITS), TEMP_UNITS);
+    CHECK_ERR
+
+    /* End define mode. */
+    err = ncmpi_enddef(ncid);
+    CHECK_ERR
+
+    /* Switch to independent data mode: only rank 0 writes the coordinate
+       variables below. Check the result like every other PnetCDF call. */
+    err = ncmpi_begin_indep_data(ncid);
+    CHECK_ERR
+
+    /* Write the coordinate variable data. This will put the latitudes
+       and longitudes of our data grid into the netCDF file. */
+    if (rank == 0) {
+        err = ncmpi_put_var_float(ncid, lat_varid, &lats[0]);
+        CHECK_ERR
+        err = ncmpi_put_var_float(ncid, lon_varid, &lons[0]);
+        CHECK_ERR
+    }
+
+    /* Return to collective data mode when collective I/O is requested. */
+    if (coll_io) {
+        err = ncmpi_end_indep_data(ncid);
+        CHECK_ERR
+    }
+
+    /* These settings tell netcdf to write one timestep of data. (The
+       setting of start[0] inside the loop below tells netCDF which
+       timestep to write.) */
+    count[0] = 1;
+    count[2] = NLAT;
+    count[3] = NLON;
+    start[2] = 0;
+    start[3] = 0;
+
+    /* divide NLVL dimension among processes */
+    count[1] = NLVL / nprocs;
+    start[1] = count[1] * rank;
+    if (rank < NLVL % nprocs) {
+        start[1] += rank;
+        count[1]++;
+    }
+    else {
+        start[1] += NLVL % nprocs;
+    }
+    if (count[1] == 0)
+        start[1] = 0;
+
+    /* allocate write buffers: the row-pointer array is zero-filled and has
+     * one spare slot, so pres_out[0] and temp_out[0] exist (as NULL) even
+     * when this rank owns no levels (count[1] == 0) */
+    pres_out = (float **)calloc((size_t)count[1] * 2 + 1, sizeof(float *));
+    temp_out = pres_out + count[1];
+    if (count[1] > 0) {
+        pres_out[0] = (float *)malloc(sizeof(float) * count[1] * 2 * NLAT * NLON);
+        temp_out[0] = pres_out[0] + count[1] * NLAT * NLON;
+        for (i = 1; i < count[1]; i++) {
+            pres_out[i] = pres_out[i - 1] + NLAT * NLON;
+            temp_out[i] = temp_out[i - 1] + NLAT * NLON;
+        }
+    }
+
+    /* initialize write buffers */
+    i = (int)start[1] * NLAT * NLON;
+    for (lvl = 0; lvl < count[1]; lvl++)
+        for (lat = 0; lat < NLAT; lat++)
+            for (lon = 0; lon < NLON; lon++) {
+                pres_out[lvl][lat * NLON + lon] = SAMPLE_PRESSURE + i;
+                temp_out[lvl][lat * NLON + lon] = SAMPLE_TEMP + i++;
+            }
+
+    /* Write the pretend data. This will write our surface pressure and
+       surface temperature data. The arrays only hold one timestep worth
+       of data. We will just rewrite the same data for each timestep. In
+       a real application, the data would change between timesteps. */
+
+    for (rec = 0; rec < NREC; rec++) {
+        start[0] = rec;
+        if (coll_io)
+            err = ncmpi_put_vara_float_all(ncid, pres_varid, start, count, &pres_out[0][0]);
+        else
+            err = ncmpi_put_vara_float(ncid, pres_varid, start, count, &pres_out[0][0]);
+        CHECK_ERR
+        if (coll_io)
+            err = ncmpi_put_vara_float_all(ncid, temp_varid, start, count, &temp_out[0][0]);
+        else
+            err = ncmpi_put_vara_float(ncid, temp_varid, start, count, &temp_out[0][0]);
+        CHECK_ERR
+    }
+
+    /* Close the file.
*/
+    err = ncmpi_close(ncid);
+    CHECK_ERR
+
+    if (count[1] > 0)
+        free(pres_out[0]);
+    free(pres_out);
+
+    return (nerrs > 0);
+}
+
+/*----< pres_temp_4D_rd() >--------------------------------------------------*/
+int pres_temp_4D_rd(const char *filename,
+                    int         coll_io,
+                    MPI_Info    info)
+{
+    int rank, nprocs, ncid, pres_varid, temp_varid, lat_varid, lon_varid;
+
+    /* The start and count arrays will tell the netCDF library where to
+       read our data. */
+    MPI_Offset start[NDIMS], count[NDIMS];
+
+    /* Program variables to hold the data we will read. We will only
+       need enough space to hold one timestep of data; one record. */
+    float **pres_in = NULL; /* [NLVL/nprocs][NLAT][NLON] */
+    float **temp_in = NULL; /* [NLVL/nprocs][NLAT][NLON] */
+
+    /* These program variables hold the latitudes and longitudes. */
+    float lats[NLAT], lons[NLON];
+
+    /* Loop indexes. */
+    int lvl, lat, lon, rec, i = 0;
+
+    /* Error handling. */
+    int err, nerrs = 0;
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    /* Open the file. */
+    err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid);
+    if (err != NC_NOERR) { /* fatal error */
+        if (rank == 0)
+            fprintf(stderr,"Error: failed to open file %s (%s)\n",filename,ncmpi_strerror(err));
+        /* Count the failure: err_out returns nerrs, which would otherwise
+         * still be 0 and report a failed open as success. */
+        nerrs++;
+        goto err_out;
+    }
+
+    if (!coll_io) {
+        err = ncmpi_begin_indep_data(ncid);
+        CHECK_ERR
+    }
+
+    /* Get the varids of the latitude and longitude coordinate variables. */
+    err = ncmpi_inq_varid(ncid, LAT_NAME, &lat_varid);
+    CHECK_ERR
+    err = ncmpi_inq_varid(ncid, LON_NAME, &lon_varid);
+    CHECK_ERR
+
+    /* Read the coordinate variable data.
*/
+    memset(lats, 0, sizeof(float) * NLAT);
+    memset(lons, 0, sizeof(float) * NLON);
+    if (coll_io) {
+        err = ncmpi_get_var_float_all(ncid, lat_varid, &lats[0]);
+        CHECK_ERR
+        err = ncmpi_get_var_float_all(ncid, lon_varid, &lons[0]);
+        CHECK_ERR
+    }
+    else {
+        err = ncmpi_get_var_float(ncid, lat_varid, &lats[0]);
+        CHECK_ERR
+        err = ncmpi_get_var_float(ncid, lon_varid, &lons[0]);
+        CHECK_ERR
+    }
+
+    /* Check the coordinate variable data. The expected value is rounded to
+     * float first, so it is compared at the same precision as the value
+     * read back; only the first mismatch per variable is reported. */
+    for (lat = 0; lat < NLAT; lat++) {
+        float exp = START_LAT + 5. * lat;
+        if (lats[lat] != exp) {
+            printf("\nError at line %d in %s: %s[%d] expect %.1f but got %.1f\n",
+                   __LINE__, __FILE__, LAT_NAME, lat, exp, lats[lat]);
+            nerrs++;
+            break;
+        }
+    }
+    for (lon = 0; lon < NLON; lon++) {
+        float exp = START_LON + 5. * lon;
+        if (lons[lon] != exp) {
+            printf("\nError at line %d in %s: %s[%d] expect %.1f but got %.1f\n",
+                   __LINE__, __FILE__, LON_NAME, lon, exp, lons[lon]);
+            nerrs++;
+            break;
+        }
+    }
+
+    /* Get the varids of the pressure and temperature netCDF variables. */
+    err = ncmpi_inq_varid(ncid, PRES_NAME, &pres_varid);
+    CHECK_ERR
+    err = ncmpi_inq_varid(ncid, TEMP_NAME, &temp_varid);
+    CHECK_ERR
+
+    /* Read the data. Since we know the contents of the file we know that the
+     * data arrays in this program are the correct size to hold one timestep.
+     */
+    count[0] = 1;
+    count[2] = NLAT;
+    count[3] = NLON;
+    start[2] = 0;
+    start[3] = 0;
+
+    /* divide NLVL dimension among processes */
+    count[1] = NLVL / nprocs;
+    start[1] = count[1] * rank;
+    if (rank < NLVL % nprocs) {
+        start[1] += rank;
+        count[1]++;
+    }
+    else {
+        start[1] += NLVL % nprocs;
+    }
+    if (count[1] == 0)
+        start[1] = 0;
+
+    /* allocate read buffers: the row-pointer array is zero-filled and has
+     * one spare slot, so pres_in[0] and temp_in[0] exist (as NULL) even
+     * when this rank owns no levels (count[1] == 0) */
+    pres_in = (float **)calloc((size_t)count[1] * 2 + 1, sizeof(float *));
+    temp_in = pres_in + count[1];
+    if (count[1] > 0) {
+        pres_in[0] = (float *)malloc(sizeof(float) * count[1] * 2 * NLAT * NLON);
+        temp_in[0] = pres_in[0] + count[1] * NLAT * NLON;
+        for (i = 1; i < count[1]; i++) {
+            pres_in[i] = pres_in[i - 1] + NLAT * NLON;
+            temp_in[i] = temp_in[i - 1] + NLAT * NLON;
+        }
+    }
+
+    /* Read and check one record at a time. */
+    for (rec = 0; rec < NREC; rec++) {
+        start[0] = rec;
+        if (coll_io)
+            err = ncmpi_get_vara_float_all(ncid, pres_varid, start, count, &pres_in[0][0]);
+        else
+            err = ncmpi_get_vara_float(ncid, pres_varid, start, count, &pres_in[0][0]);
+        CHECK_ERR
+        if (coll_io)
+            err = ncmpi_get_vara_float_all(ncid, temp_varid, start, count, &temp_in[0][0]);
+        else
+            err = ncmpi_get_vara_float(ncid, temp_varid, start, count, &temp_in[0][0]);
+        CHECK_ERR
+
+        /* Check the data.
*/
+        i = (int)start[1] * NLAT * NLON;
+        for (lvl = 0; lvl < count[1]; lvl++)
+            for (lat = 0; lat < NLAT; lat++)
+                for (lon = 0; lon < NLON; lon++) {
+                    float exp = SAMPLE_PRESSURE + i;
+                    int indx = lat * NLON + lon;
+                    if (pres_in[lvl][indx] != exp) {
+                        printf("\nError at line %d in %s: %s[%d][%d][%d][%d] expect %.1f but got %.1f\n",
+                               __LINE__, __FILE__, PRES_NAME, rec, lvl, lat, lon, exp, pres_in[lvl][indx]);
+                        nerrs++;
+                        goto loop_exit;
+                    }
+                    exp = SAMPLE_TEMP + i;
+                    if (temp_in[lvl][indx] != exp) {
+                        printf("\nError at line %d in %s: %s[%d][%d][%d][%d] expect %.1f but got %.1f\n",
+                               __LINE__, __FILE__, TEMP_NAME, rec, lvl, lat, lon, exp, temp_in[lvl][indx]);
+                        nerrs++;
+                        goto loop_exit;
+                    }
+                    i++;
+                }
+
+loop_exit:
+        /* Take the maximum error count over all ranks so that every rank
+         * makes the same decision about leaving the record loop. */
+        MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+        if (nerrs > 0) break;
+    } /* next record */
+
+    /* Close the file. */
+    err = ncmpi_close(ncid);
+    CHECK_ERR
+
+    if (pres_in != NULL) {
+        /* pres_in[0] is assigned only when this rank owns at least one
+         * level, so guard on count[1] exactly as pres_temp_4D_wr() does. */
+        if (count[1] > 0)
+            free(pres_in[0]);
+        free(pres_in);
+    }
+
+err_out:
+    return nerrs;
+}
+
+/* Write the file, wait for all ranks, then read it back and verify it.
+ * Returns the first non-zero error count, or 0 when both phases succeed. */
+static
+int test_io(const char *out_path,
+            const char *in_path, /* ignored */
+            int         format,
+            int         coll_io,
+            MPI_Info    info)
+{
+    int nerrs;
+
+    nerrs = pres_temp_4D_wr(out_path, format, coll_io, info);
+    if (nerrs > 0) return nerrs;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    nerrs = pres_temp_4D_rd(out_path, coll_io, info);
+    if (nerrs > 0) return nerrs;
+
+    return 0;
+}
+
+int main(int argc, char **argv) {
+
+    int err;
+    loop_opts opt;
+
+    MPI_Init(&argc, &argv);
+
+    opt.num_fmts = sizeof(nc_formats) / sizeof(int);
+    opt.formats  = nc_formats;
+    opt.ina      = 1; /* test intra-node aggregation */
+    opt.drv      = 1; /* test PNCIO driver */
+    opt.ind      = 1; /* test hint romio_no_indep_rw */
+    opt.chk      = 1; /* test hint nc_data_move_chunk_size */
+    opt.bb       = 1; /* test burst-buffering feature */
+    opt.mod      = 1; /* test independent data mode */
+    opt.hdr_diff = 1; /* run ncmpidiff for file header only */
+    opt.var_diff = 1; /* run ncmpidiff for variables */
+
+    err
= tst_main(argc, argv, "write/read netCDF file", opt, test_io); + + MPI_Finalize(); + + return err; +} + diff --git a/test/C/seq_runs.sh b/test/C/seq_runs.sh index 1098e28969..205d1fb396 100755 --- a/test/C/seq_runs.sh +++ b/test/C/seq_runs.sh @@ -1,58 +1,44 @@ -#!/bin/sh +#!/bin/bash # -# Copyright (C) 2003, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" fi +exe_name=`basename $1` + # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS for j in ${safe_modes} ; do - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${TESTSEQRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.nc - ${TESTSEQRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.nc - # echo "--- validating file ${TESTOUTDIR}/pres_temp_4D.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/pres_temp_4D.nc - # echo "" - - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${TESTSEQRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.bb.nc - ${TESTSEQRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.bb.nc - unset PNETCDF_HINTS - # echo 
"--- validating file ${TESTOUTDIR}/pres_temp_4D.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/pres_temp_4D.bb.nc - # echo "" - - # echo "--- ncmpidiff pres_temp_4D.nc pres_temp_4D.bb.nc ---" - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/pres_temp_4D.nc ${TESTOUTDIR}/pres_temp_4D.bb.nc - fi - # echo "" - - if test "${ENABLE_NETCDF4}" = 1 ; then - ${TESTSEQRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.nc4 4 - ${TESTSEQRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.nc4 - # Validator does not support nc4 - fi - # echo "" -done -rm -f ${OUTDIR}/*.nc -rm -f ${OUTDIR}/*.nc4 + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc + +done # safe_modes + diff --git a/test/CXX/Makefile.am b/test/CXX/Makefile.am index 532133e059..4c8119c41f 100644 --- a/test/CXX/Makefile.am +++ b/test/CXX/Makefile.am @@ -22,10 +22,8 @@ if DECL_MPI_OFFSET AM_CPPFLAGS += -DHAVE_DECL_MPI_OFFSET endif -TESTPROGRAMS = nctst \ - test_classic - -check_PROGRAMS = $(TESTPROGRAMS) +check_PROGRAMS = nctst \ + test_classic # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead @@ -34,28 +32,37 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export ENABLE_NETCDF4="$(ENABLE_NETCDF4)"; -TESTS = $(TESTPROGRAMS) -LOG_COMPILER = $(srcdir)/wrap_runs.sh +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export 
PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_NETCDF4 + TESTS_ENVIRONMENT += export ENABLE_NETCDF4=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif -EXTRA_DIST = wrap_runs.sh parallel_run.sh +TESTS = $(check_PROGRAMS) +LOG_COMPILER = $(srcdir)/seq_runs.sh -NC_FILES = $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.nc) \ - $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) +EXTRA_DIST = seq_runs.sh parallel_run.sh -CLEANFILES = $(NC_FILES) \ +CLEANFILES = $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ core core.* *.gcda *.gcno *.gcov gmon.out a.out ../common/libtestutils.la: set -e; cd ../common && $(MAKE) $(MFLAGS) tests -ptest ptests ptest4: $(TESTPROGRAMS) +ptest ptests ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" diff --git a/test/CXX/nctst.cpp b/test/CXX/nctst.cpp index 84cd1a4add..5ab07ab717 100644 --- a/test/CXX/nctst.cpp +++ b/test/CXX/nctst.cpp @@ -362,7 +362,7 @@ cdl_name(const char* path) while (*(cp-1) != '/' && cp != path) // assumes UNIX path separator cp--; - static char np[NC_MAX_NAME]; + static char np[NC_MAX_NAME+1]; strncpy(&np[0], cp, NC_MAX_NAME); char* ep = np + strlen(np); @@ -509,12 +509,16 @@ main(int argc, char* argv[]) // test new netCDF interface { char filename[256]; int rank, nprocs; + double timing; MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { + if (argc < 2) { if (!rank) printf("Usage: %s [filename]\n",argv[0]); MPI_Finalize(); return 1; @@ -525,8 +529,8 @@ main(int argc, char* argv[]) // test new netCDF interface if (rank == 0) { std::ostringstream cmd_str; cmd_str << "*** TESTING C++ " << basename(argv[0]) << - " 
for APIs with different netCDF formats"; - printf("%-66s ------ ", cmd_str.str().c_str()); + " - APIs with different netCDF formats"; + printf("%-63s -- ", cmd_str.str().c_str()); } // Set up the format constants. @@ -568,13 +572,16 @@ main(int argc, char* argv[]) // test new netCDF interface if (err == NC_NOERR) { MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", + printf("heap memory allocated by PnetCDF internally has " OFFFMT " bytes yet to be freed\n", sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); + if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/CXX/parallel_run.sh b/test/CXX/parallel_run.sh index 4f887ad275..db42d4f27a 100755 --- a/test/CXX/parallel_run.sh +++ b/test/CXX/parallel_run.sh @@ -1,24 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" -# echo "TESTPROGRAMS=${TESTPROGRAMS}" +# echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,47 +33,23 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -for i in ${TESTPROGRAMS} ; do - for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + +for i in ${check_PROGRAMS} ; do - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - # echo "" + exe_name=`basename $i` - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} + for j in ${safe_modes} ; do - # echo "--- validating file 
${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc - fi + run_cmd ./$i -q ${TESTOUTDIR}/${exe_name}.nc - if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - # Validator does not support NetCDF-4 format - fi - done - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.bb.nc -done + done # safe_modes +done # check_PROGRAMS diff --git a/test/CXX/seq_runs.sh b/test/CXX/seq_runs.sh new file mode 100755 index 0000000000..98dbf64a60 --- /dev/null +++ b/test/CXX/seq_runs.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. +# + +# Exit immediately if a command exits with a non-zero status. 
+set -e + +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +exe_name=`basename $1` + +# prevent user environment setting of PNETCDF_HINTS to interfere +unset PNETCDF_HINTS + +for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$1 ${TESTOUTDIR}/${exe_name}.nc + +done # safe_modes + diff --git a/test/CXX/test_classic.cpp b/test/CXX/test_classic.cpp index c600181af9..5ebfc5e87c 100644 --- a/test/CXX/test_classic.cpp +++ b/test/CXX/test_classic.cpp @@ -15,10 +15,14 @@ int main( int argc, char *argv[] ) { char filename[256]; int rank, nerrs=0, verbose=0; + double timing; MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { + if (argc < 2) { if (!rank) printf("Usage: %s [filename]\n",argv[0]); MPI_Finalize(); return 1; @@ -32,8 +36,8 @@ int main( int argc, char *argv[] ) std::ostringstream cmd_str; cmd_str << "*** TESTING C++ " << basename(argv[0]) << - " for creation of classic format file"; - printf("%-66s ------ ", cmd_str.str().c_str()); + " - creation of classic format file"; + printf("%-63s -- ", cmd_str.str().c_str()); } try @@ -88,13 +92,16 @@ int main( int argc, char *argv[] ) if (err == NC_NOERR) { MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", + printf("heap memory allocated by PnetCDF internally has " OFFFMT " bytes yet to be freed\n", sum_size); } + timing = MPI_Wtime() - timing; + 
MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); + if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/CXX/wrap_runs.sh b/test/CXX/wrap_runs.sh index d34ca7b47f..18eb1624c8 100755 --- a/test/CXX/wrap_runs.sh +++ b/test/CXX/wrap_runs.sh @@ -16,7 +16,7 @@ outfile=`basename $1` OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} ./$1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc # echo "" @@ -36,7 +58,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -45,6 +67,7 @@ for j in ${safe_modes} ; do ${TESTSEQRUN} ${NCMPIDIFF} -q 
${TESTOUTDIR}/$outfile.nc ${TESTOUTDIR}/$outfile.bb.nc fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/F90/Makefile.am b/test/F90/Makefile.am index 4d48127679..1724ee0092 100644 --- a/test/F90/Makefile.am +++ b/test/F90/Makefile.am @@ -28,29 +28,25 @@ if NAGFORT AM_FCFLAGS += -w=uparam endif -TESTPROGRAMS = tst_f90 \ - f90tst_vars \ - tst_types2 \ - tst_f90_cdf5 \ - f90tst_vars2 \ - f90tst_vars3 \ - f90tst_vars4 \ - test_intent \ - test_attr_int64 \ - test_fill - -PARALLEL_PROGS = f90tst_parallel \ +check_PROGRAMS = tst_f90 \ + f90tst_vars \ + tst_types2 \ + tst_f90_cdf5 \ + f90tst_vars2 \ + f90tst_vars3 \ + f90tst_vars4 \ + test_intent \ + test_attr_int64 \ + test_fill \ + f90tst_parallel \ f90tst_parallel2 \ f90tst_parallel3 \ - f90tst_parallel4 - -check_PROGRAMS = $(TESTPROGRAMS) \ - $(PARALLEL_PROGS) \ + f90tst_parallel4 \ tst_io # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead -# AM_TESTS_ENVIRONMENT = TESTPROGRAMS="$(TESTPROGRAMS)" ; export TESTPROGRAMS; +# AM_TESTS_ENVIRONMENT = check_PROGRAMS="$(check_PROGRAMS)" ; export check_PROGRAMS; # AM_TESTS_ENVIRONMENT += TESTSEQRUN="$(TESTSEQRUN)" ; export TESTSEQRUN; # AM_TESTS_ENVIRONMENT += TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)" ; export TESTOUTDIR; TESTS_ENVIRONMENT = export SED="$(SED)"; @@ -58,59 +54,70 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export PARALLEL_PROGS="$(PARALLEL_PROGS)"; -TESTS = $(TESTPROGRAMS) 
seq_runs.sh +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + +TESTS = $(check_PROGRAMS) TEST_EXTENSIONS = .sh -LOG_COMPILER = $(srcdir)/wrap_runs.sh +LOG_COMPILER = $(srcdir)/seq_runs.sh SH_LOG_COMPILER = EXTRA_DIST = seq_runs.sh wrap_runs.sh parallel_run.sh -NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) - CLEANFILES = *.$(FC_MODEXT) core core.* *.gcda *.gcno *.gcov gmon.out \ - $(TESTOUTDIR)/tst_io1.nc $(TESTOUTDIR)/tst_io1.nc0 \ - $(TESTOUTDIR)/testfile.nc $(NC_FILES) + $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb ../common/libtestutils.la: set -e; cd ../common && $(MAKE) $(MFLAGS) tests -ptest ptest4: $(PARALLEL_PROGS) +ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 4 || exit 1 -ptest2: $(PARALLEL_PROGS) +ptest2: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 2 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 2 || exit 1 -ptest8: $(PARALLEL_PROGS) +ptest6: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 6 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 6 || exit 1 + +ptest8: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 8 MPI processes" 
@echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 8 || exit 1 -ptest10: $(PARALLEL_PROGS) +ptest10: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 10 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 10 || exit 1 -ptests: ptest2 ptest4 ptest8 ptest10 -ptest6: +ptests: ptest4 ptest8 # build check targets but not invoke tests-local: all $(check_PROGRAMS) diff --git a/test/F90/f90tst_parallel.f90 b/test/F90/f90tst_parallel.f90 index d6223a334a..725e1aa976 100644 --- a/test/F90/f90tst_parallel.f90 +++ b/test/F90/f90tst_parallel.f90 @@ -56,28 +56,35 @@ program f90tst_parallel integer :: mode_flag integer :: nvars, ngatts, ndims, unlimdimid, file_format integer :: x, y - integer :: p, my_rank, err, ierr, get_args + integer :: nprocs, my_rank, err, ierr, get_args integer(KIND=MPI_OFFSET_KIND) :: start(MAX_DIMS), count(MAX_DIMS) integer(KIND=MPI_OFFSET_KIND) :: nx_ll, ny_ll - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 
0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) -! if (p .ne. 4 .AND. my_rank .eq. 0) then -! print *, 'Warning: ',trim(cmd),' is design to run on 4 processes.' -! endif + ! if (nprocs .ne. 4 .AND. my_rank .eq. 0) then + ! print *, 'Warning: ',trim(cmd),' is design to run on 4 processes.' + ! endif ! Create some pretend data. do x = 1, NX / 2 @@ -88,7 +95,7 @@ program f90tst_parallel ! Create the netCDF file. mode_flag = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call handle_err(nf90mpi_create(MPI_COMM_WORLD, filename, mode_flag, MPI_INFO_NULL, ncid)) + call handle_err(nf90mpi_create(MPI_COMM_WORLD, out_path, mode_flag, MPI_INFO_NULL, ncid)) ! Define the dimensions. nx_ll = NX @@ -100,6 +107,9 @@ program f90tst_parallel ! Define the variable. call handle_err(nf90mpi_def_var(ncid, "data", NF90_INT, dimids, varid)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT)) + call handle_err(nf90mpi_enddef(ncid)) ! Determine what part of the variable will be written for this @@ -125,7 +135,7 @@ program f90tst_parallel call handle_err(nf90mpi_close(ncid)) ! Reopen the file. - call handle_err(nf90mpi_open(MPI_COMM_WORLD, filename, nf90_nowrite, MPI_INFO_NULL, ncid)) + call handle_err(nf90mpi_open(MPI_COMM_WORLD, out_path, nf90_nowrite, MPI_INFO_NULL, ncid)) ! Check some stuff out. call handle_err(nf90mpi_inquire(ncid, ndims, nvars, ngatts, unlimdimid, file_format)) @@ -150,12 +160,21 @@ program f90tst_parallel ! Close the file. call handle_err(nf90mpi_close(ncid)) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + if (my_rank .eq. 0) then - msg = '*** TESTING F90 '//trim(cmd) - call pass_fail(0, msg) + if (.NOT. 
keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd) + call pass_fail(0, msg, timing) endif - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains diff --git a/test/F90/f90tst_parallel2.f90 b/test/F90/f90tst_parallel2.f90 index b7e4a0e37c..51de6c579a 100644 --- a/test/F90/f90tst_parallel2.f90 +++ b/test/F90/f90tst_parallel2.f90 @@ -60,28 +60,35 @@ program f90tst_parallel integer :: mode_flag, old_fillmode integer :: nvars, ngatts, ndims, unlimdimid, file_format integer :: x, y - integer :: p, my_rank, err, ierr, get_args + integer :: nprocs, my_rank, err, ierr, get_args integer(KIND=MPI_OFFSET_KIND) :: start(MAX_DIMS), count(MAX_DIMS), stride(MAX_DIMS) integer(KIND=MPI_OFFSET_KIND) :: nx_ll, ny_ll - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) -! if (p .ne. 4 .AND. my_rank .eq. 0) then -! print *, 'Warning: ',trim(cmd),' is design to run on 4 processes.' -! endif + ! if (nprocs .ne. 4 .AND. my_rank .eq. 0) then + ! print *, 'Warning: ',trim(cmd),' is design to run on 4 processes.' + ! endif ! Create some pretend data. 
do x = 1, NX / 4 @@ -92,7 +99,7 @@ program f90tst_parallel ! Create the netCDF file. mode_flag = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call handle_err(nf90mpi_create(MPI_COMM_WORLD, filename, mode_flag, MPI_INFO_NULL, ncid)) + call handle_err(nf90mpi_create(MPI_COMM_WORLD, out_path, mode_flag, MPI_INFO_NULL, ncid)) ! Define the dimensions. nx_ll = NX @@ -139,7 +146,7 @@ program f90tst_parallel call handle_err(nf90mpi_close(ncid)) ! Reopen the file. - call handle_err(nf90mpi_open(MPI_COMM_WORLD, filename, nf90_nowrite, MPI_INFO_NULL, ncid)) + call handle_err(nf90mpi_open(MPI_COMM_WORLD, out_path, nf90_nowrite, MPI_INFO_NULL, ncid)) ! Check some stuff out. call handle_err(nf90mpi_inquire(ncid, ndims, nvars, ngatts, unlimdimid, file_format)) @@ -165,12 +172,21 @@ program f90tst_parallel ! Close the file. call handle_err(nf90mpi_close(ncid)) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + if (my_rank .eq. 0) then - msg = '*** TESTING F90 '//trim(cmd)//' for strided access' - call pass_fail(0, msg) + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - strided access' + call pass_fail(0, msg, timing) endif - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! 
This subroutine handles errors by printing an error message and diff --git a/test/F90/f90tst_parallel3.f90 b/test/F90/f90tst_parallel3.f90 index 08bd615a0a..1bdba8884a 100644 --- a/test/F90/f90tst_parallel3.f90 +++ b/test/F90/f90tst_parallel3.f90 @@ -55,29 +55,36 @@ program f90tst_parallel3 integer (kind=EightByteInt) :: int64_out(HALF_NY, HALF_NX), int64_in(HALF_NY, HALF_NX) integer :: nvars, ngatts, ndims, unlimdimid, file_format integer :: x, y, v - integer :: p, my_rank, err, ierr, get_args, old_mode + integer :: nprocs, my_rank, err, ierr, get_args, old_mode integer(KIND=MPI_OFFSET_KIND) :: start(MAX_DIMS), count(MAX_DIMS) integer :: cmode integer(KIND=MPI_OFFSET_KIND) :: nx_ll, ny_ll - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) -! if (p .ne. 4 .AND. my_rank .eq. 0) then -! print *, 'Warning: ',trim(cmd),' is design to run on 4 processes.' -! endif + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + + ! if (nprocs .ne. 4 .AND. my_rank .eq. 0) then + ! print *, 'Warning: ',trim(cmd),' is design to run on 4 processes.' + ! endif ! Create some pretend data. do x = 1, HALF_NX @@ -93,7 +100,7 @@ program f90tst_parallel3 ! Create the netCDF file. 
cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call check(nf90mpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, ncid)) + call check(nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, MPI_INFO_NULL, ncid)) ! Define the dimensions. nx_ll = NX @@ -147,7 +154,7 @@ program f90tst_parallel3 call check(nf90mpi_close(ncid)) ! Reopen the file. - call check(nf90mpi_open(MPI_COMM_WORLD, filename, nf90_nowrite, MPI_INFO_NULL, ncid)) + call check(nf90mpi_open(MPI_COMM_WORLD, out_path, nf90_nowrite, MPI_INFO_NULL, ncid)) ! Check some stuff out. call check(nf90mpi_inquire(ncid, ndims, nvars, ngatts, unlimdimid, file_format)) @@ -183,12 +190,21 @@ program f90tst_parallel3 ! Close the file. call check(nf90mpi_close(ncid)) - if (my_rank .eq. 0) then - msg = '*** TESTING F90 '//trim(cmd) - call pass_fail(0, msg) - endif + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd) + call pass_fail(0, msg, timing) + endif - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! This subroutine handles errors by printing an error message and diff --git a/test/F90/f90tst_parallel4.f90 b/test/F90/f90tst_parallel4.f90 index b545fa8474..c618d5acbe 100644 --- a/test/F90/f90tst_parallel4.f90 +++ b/test/F90/f90tst_parallel4.f90 @@ -17,38 +17,47 @@ program f90tst integer :: dimid(3) integer(KIND=MPI_OFFSET_KIND) :: start(3), count(3) real :: f(3) - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg + logical keep_files + double precision timing + + call MPI_Init(ierr) + + timing = MPI_Wtime() - call MPI_INIT(ierr) call MPI_COMM_RANK(MPI_COMM_WORLD, my_rank, ierr) call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! 
take out_path from command-line argument if there is any if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) -! if (nprocs .ne. 8 .AND. my_rank .eq. 0) then -! print *, 'Warning: ',trim(cmd),' is design to run on 8 processes.' -! endif + ! if (nprocs .ne. 8 .AND. my_rank .eq. 0) then + ! print *, 'Warning: ',trim(cmd),' is design to run on 8 processes.' + ! endif nmode = IOR(NF90_CLOBBER,NF90_64BIT_DATA) - call handle_err(nf90mpi_create(MPI_COMM_WORLD, filename, nmode, MPI_INFO_NULL, fh)) + call handle_err(nf90mpi_create(MPI_COMM_WORLD, out_path, nmode, MPI_INFO_NULL, fh)) call handle_err(nf90mpi_def_dim(fh, 'dim1', 6_MPI_OFFSET_KIND, dimid(1))) call handle_err(nf90mpi_def_dim(fh, 'dim2', 4_MPI_OFFSET_KIND, dimid(2))) call handle_err(nf90mpi_def_dim(fh, 'dim3', 1_MPI_OFFSET_KIND, dimid(3))) - call handle_err(nf90mpi_def_var(fh, 'var1', NF90_DOUBLE, dimid, varid)) - call handle_err(nf90mpi_enddef(fh)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(fh, varid, 0, NF90_FILL_DOUBLE)) + + call handle_err(nf90mpi_enddef(fh)) do i=1,3 f(i) = my_rank*3+i @@ -68,7 +77,7 @@ program f90tst call handle_err(nf90mpi_close(fh)) ! Reopen the file and check it. 
- call handle_err(nf90mpi_open(MPI_COMM_WORLD, filename, NF90_NOWRITE, MPI_INFO_NULL, fh)) + call handle_err(nf90mpi_open(MPI_COMM_WORLD, out_path, NF90_NOWRITE, MPI_INFO_NULL, fh)) call handle_err(nf90mpi_get_var_all(fh, varid, f, start=start, count=count)) @@ -83,12 +92,21 @@ program f90tst call handle_err(nf90mpi_close(fh)) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + if (my_rank .eq. 0) then + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F90 '//trim(cmd) - call pass_fail(0, msg) + call pass_fail(0, msg, timing) endif - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! This subroutine handles errors by printing an error message and diff --git a/test/F90/f90tst_vars.f90 b/test/F90/f90tst_vars.f90 index 7f43a69089..78faf601d0 100644 --- a/test/F90/f90tst_vars.f90 +++ b/test/F90/f90tst_vars.f90 @@ -31,24 +31,34 @@ program f90tst_vars integer, parameter :: CACHE_SIZE = 1000000 integer :: info, err, ierr, get_args integer(KIND=MPI_OFFSET_KIND) :: nx_ll, ny_ll - character(LEN=256) filename, cmd, msg - integer my_rank, p + character(LEN=256) out_path, in_path, cmd, msg + integer my_rank, nprocs + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 
0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + + if (my_rank > 0) goto 999; -! if (p .ne. 1 .AND. my_rank .eq. 0) then +! if (nprocs .ne. 1 .AND. my_rank .eq. 0) then ! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' ! endif @@ -65,7 +75,7 @@ program f90tst_vars ! Create the netCDF file. mode_flag = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call handle_err(nf90mpi_create(MPI_COMM_WORLD, filename, mode_flag, info, ncid)) + call handle_err(nf90mpi_create(MPI_COMM_SELF, out_path, mode_flag, info, ncid)) call MPI_Info_free(info, ierr) ! Define the dimensions. @@ -78,6 +88,9 @@ program f90tst_vars ! Define the variable. call handle_err(nf90mpi_def_var(ncid, "data", NF90_INT, dimids, varid)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT)) + ! With classic model netCDF-4 file, enddef must be called. call handle_err(nf90mpi_enddef(ncid)) @@ -91,7 +104,7 @@ program f90tst_vars call handle_err(nf90mpi_close(ncid)) ! Reopen the file. - call handle_err(nf90mpi_open(MPI_COMM_WORLD, filename, nf90_nowrite, MPI_INFO_NULL, ncid)) + call handle_err(nf90mpi_open(MPI_COMM_SELF, out_path, nf90_nowrite, MPI_INFO_NULL, ncid)) ! Check some stuff out. call handle_err(nf90mpi_inquire(ncid, ndims, nvars, ngatts, unlimdimid, file_format)) @@ -116,10 +129,21 @@ program f90tst_vars ! Close the file. call handle_err(nf90mpi_close(ncid)) - msg = '*** TESTING F90 '//trim(cmd)//' for def_var API' - if (my_rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - def_var API' + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! This subroutine handles errors by printing an error message and diff --git a/test/F90/f90tst_vars2.f90 b/test/F90/f90tst_vars2.f90 index d2011970ed..4ce95a111a 100644 --- a/test/F90/f90tst_vars2.f90 +++ b/test/F90/f90tst_vars2.f90 @@ -24,9 +24,8 @@ program f90tst_vars2 ! We need these ids and other gunk for netcdf. integer :: ncid, varid1, varid2, varid3, varid4, varid5, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: DEFLATE_LEVEL = 4 integer, parameter :: EightByteInt = selected_int_kind(18) integer (kind = EightByteInt) :: TOE_SAN_VALUE = 2147483648_EightByteInt @@ -45,26 +44,36 @@ program f90tst_vars2 integer (kind = EightByteInt) :: toe_san_in(1) integer :: cmode, err, ierr, get_args integer(KIND=MPI_OFFSET_KIND) :: nx_ll, ny_ll - character(LEN=256) filename, cmd, msg - integer my_rank, p + character(LEN=256) out_path, in_path, cmd, msg + integer my_rank, nprocs + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 
0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + + if (my_rank > 0) goto 999 -! if (p .ne. 1 .AND. my_rank .eq. 0) then -! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' -! endif + ! if (nprocs .ne. 1 .AND. my_rank .eq. 0) then + ! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' + ! endif ! Create some pretend data. do x = 1, NX @@ -78,7 +87,7 @@ program f90tst_vars2 ! Create the netCDF file. cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call check(nf90mpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, ncid)) + call check(nf90mpi_create(MPI_COMM_SELF, out_path, cmode, MPI_INFO_NULL, ncid)) ! Define the dimensions. nx_ll = NX @@ -94,6 +103,8 @@ program f90tst_vars2 call check(nf90mpi_def_var(ncid, VAR4_NAME, NF90_INT, x_dimid, varid4)) call check(nf90mpi_def_var(ncid, VAR5_NAME, NF90_INT, dimids, varid5)) + call check(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + call check(nf90mpi_enddef(ncid)) ! enter independent data mode @@ -110,7 +121,7 @@ program f90tst_vars2 call check(nf90mpi_close(ncid)) ! Reopen the file. - call check(nf90mpi_open(MPI_COMM_WORLD, filename, nf90_nowrite, MPI_INFO_NULL, ncid)) + call check(nf90mpi_open(MPI_COMM_SELF, out_path, nf90_nowrite, MPI_INFO_NULL, ncid)) ! Check some stuff out. call check(nf90mpi_inquire(ncid, ndims, nvars, ngatts, unlimdimid, file_format)) @@ -173,10 +184,21 @@ program f90tst_vars2 ! Close the file. call check(nf90mpi_close(ncid)) - msg = '*** TESTING F90 '//trim(cmd)//' for def_var API' - if (my_rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - def_var API' + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! This subroutine handles errors by printing an error message and diff --git a/test/F90/f90tst_vars3.f90 b/test/F90/f90tst_vars3.f90 index 88fadc5692..9edec5ffa5 100644 --- a/test/F90/f90tst_vars3.f90 +++ b/test/F90/f90tst_vars3.f90 @@ -24,9 +24,8 @@ program f90tst_vars3 ! We need these ids and other gunk for netcdf. integer :: ncid, varid1, varid2, varid3, varid4, varid5, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: DEFAULT_CACHE_NELEMS = 10000, DEFAULT_CACHE_SIZE = 1000000 integer, parameter :: DEFAULT_CACHE_PREEMPTION = 22 integer, parameter :: DEFLATE_LEVEL = 4 @@ -46,26 +45,37 @@ program f90tst_vars3 integer (kind = EightByteInt) :: toe_san_in(1) integer :: cmode, err, ierr, get_args integer(KIND=MPI_OFFSET_KIND) :: nx_ll, ny_ll - character(LEN=256) filename, cmd, msg - integer my_rank, p + character(LEN=256) out_path, in_path, cmd, msg + integer my_rank, nprocs + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 
0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + + ! This program is design to run on 1 process' + if (my_rank > 0) goto 999 -! if (p .ne. 1 .AND. my_rank .eq. 0) then -! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' -! endif + ! if (nprocs .ne. 1 .AND. my_rank .eq. 0) then + ! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' + ! endif ! Create some pretend data. do x = 1, NX @@ -79,7 +89,7 @@ program f90tst_vars3 ! Create the netCDF file. cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call check(nf90mpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, ncid)) + call check(nf90mpi_create(MPI_COMM_SELF, out_path, cmode, MPI_INFO_NULL, ncid)) ! Define the dimensions. nx_ll = NX @@ -95,6 +105,8 @@ program f90tst_vars3 call check(nf90mpi_def_var(ncid, VAR4_NAME, NF90_INT, x_dimid, varid4)) call check(nf90mpi_def_var(ncid, VAR5_NAME, NF90_INT, dimids, varid5)) + call check(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + call check(nf90mpi_enddef(ncid)) call check(nf90mpi_begin_indep_data(ncid)) @@ -109,7 +121,7 @@ program f90tst_vars3 call check(nf90mpi_close(ncid)) ! Reopen the file. - call check(nf90mpi_open(MPI_COMM_WORLD, filename, nf90_nowrite, MPI_INFO_NULL, ncid)) + call check(nf90mpi_open(MPI_COMM_SELF, out_path, nf90_nowrite, MPI_INFO_NULL, ncid)) ! Check some stuff out. call check(nf90mpi_inquire(ncid, ndims, nvars, ngatts, unlimdimid, file_format)) @@ -172,10 +184,21 @@ program f90tst_vars3 ! Close the file. call check(nf90mpi_close(ncid)) - msg = '*** TESTING F90 '//trim(cmd)//' for def_var API' - if (my_rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - def_var API' + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! This subroutine handles errors by printing an error message and diff --git a/test/F90/f90tst_vars4.f90 b/test/F90/f90tst_vars4.f90 index 1104246e38..0afbfb8ac9 100644 --- a/test/F90/f90tst_vars4.f90 +++ b/test/F90/f90tst_vars4.f90 @@ -22,33 +22,42 @@ program f90tst_vars4 ! We need these ids and other gunk for netcdf. integer :: ncid, varid, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: mode_flag integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: CACHE_SIZE = 1000000 integer :: xtype_in, natts_in, dimids_in(MAX_DIMS) character (len = NF90_MAX_NAME) :: name_in integer :: err, ierr, get_args integer(KIND=MPI_OFFSET_KIND) :: nx_ll, ny_ll - character(LEN=256) filename, cmd, msg - integer my_rank, p + character(LEN=256) out_path, in_path, cmd, msg + integer my_rank, nprocs + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + + if (my_rank > 0) goto 999 -! 
if (p .ne. 1 .AND. my_rank .eq. 0) then +! if (nprocs .ne. 1 .AND. my_rank .eq. 0) then ! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' ! endif @@ -61,7 +70,7 @@ program f90tst_vars4 ! Create the netCDF file. mode_flag = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call handle_err(nf90mpi_create(MPI_COMM_WORLD, filename, mode_flag, MPI_INFO_NULL, ncid)) + call handle_err(nf90mpi_create(MPI_COMM_SELF, out_path, mode_flag, MPI_INFO_NULL, ncid)) ! Define the dimensions. nx_ll = NX @@ -73,6 +82,8 @@ program f90tst_vars4 ! Define the variable. call handle_err(nf90mpi_def_var(ncid, 'data', NF90_INT, dimids, varid)) + call handle_err(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + ! enddef must be called. call handle_err(nf90mpi_enddef(ncid)) @@ -85,7 +96,7 @@ program f90tst_vars4 call handle_err(nf90mpi_close(ncid)) ! Reopen the file. - call handle_err(nf90mpi_open(MPI_COMM_WORLD, filename, nf90_nowrite, MPI_INFO_NULL, ncid)) + call handle_err(nf90mpi_open(MPI_COMM_SELF, out_path, nf90_nowrite, MPI_INFO_NULL, ncid)) ! Check some stuff out. call handle_err(nf90mpi_inquire(ncid, ndims, nvars, ngatts, unlimdimid, file_format)) @@ -110,10 +121,21 @@ program f90tst_vars4 ! Close the file. call handle_err(nf90mpi_close(ncid)) - msg = '*** TESTING F90 '//trim(cmd)//' for def_var API' - if (my_rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - def_var API' + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! 
This subroutine handles errors by printing an error message and diff --git a/test/F90/parallel_run.sh b/test/F90/parallel_run.sh index da29ea3bc1..aaba3adfe7 100755 --- a/test/F90/parallel_run.sh +++ b/test/F90/parallel_run.sh @@ -1,24 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" -# echo "PARALLEL_PROGS=${PARALLEL_PROGS}" +# echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,49 +33,29 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -for i in ${PARALLEL_PROGS} ; do - for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - # echo "" - - if test 
"x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} - - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc - - if test "$1" != "./tst_flarge" ; then - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc - fi - fi - - if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc4 4 - # Validator does not support nc4 - fi - done - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.bb.nc -done +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + +for i in ${check_PROGRAMS} ; do + + exe_name=`basename $i` + + for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + if test "x$exe_name" = xtst_io ; then + run_cmd ./$i -q -o ${TESTOUTDIR} + continue + fi + + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc + + done # safe_modes + +done # check_PROGRAMS diff --git a/test/F90/seq_runs.sh b/test/F90/seq_runs.sh index 9a816ad2d6..fa271b7a47 100755 --- a/test/F90/seq_runs.sh +++ b/test/F90/seq_runs.sh @@ -1,35 +1,49 @@ -#!/bin/sh +#!/bin/bash # -# Copyright (C) 2003, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +exe_name=`basename $1` # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -${TESTSEQRUN} ./tst_io ${TESTOUTDIR} -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_io1.nc -# remove file system type prefix if there is any -OUTDIR=$(echo $TESTOUTDIR | cut -d: -f2) -mv ${OUTDIR}/tst_io1.nc ${OUTDIR}/tst_io1.nc0 - -if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${TESTSEQRUN} ./tst_io ${TESTOUTDIR} - unset PNETCDF_HINTS - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_io1.nc - - # echo "--- ncmpidiff tst_io1.nc0 tst_io1.nc ---" - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/tst_io1.nc0 ${TESTOUTDIR}/tst_io1.nc -fi -rm -f ${OUTDIR}/tst_io1.nc0 -rm -f ${OUTDIR}/tst_io1.nc +for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + if test "x$exe_name" = xtst_io ; then + run_cmd ./$1 -q -o ${TESTOUTDIR} + continue + fi + + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc + +done # safe_modes diff --git a/test/F90/test_attr_int64.f90 b/test/F90/test_attr_int64.f90 index 19b19183c8..2dd3c659fa 100644 --- a/test/F90/test_attr_int64.f90 +++ b/test/F90/test_attr_int64.f90 @@ -18,7 +18,7 @@ subroutine check(err, message) if (err .NE. 
NF90_NOERR) then write(6,*) trim(message), trim(nf90mpi_strerror(err)) msg = '*** TESTING F90 test_attr_int64.f90 ' - call pass_fail(1, msg) + call pass_fail(1, msg, 0) STOP 2 end if end subroutine check @@ -28,27 +28,35 @@ program main use pnetcdf implicit none - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer rank, err, ierr, ncid, cmode, get_args, xtype, varid integer(kind=MPI_OFFSET_KIND) :: buf integer,parameter :: INT2_KIND = selected_int_kind(4) integer fillmode + logical keep_files + double precision timing + + call MPI_Init(ierr) + + timing = MPI_Wtime() - call MPI_Init(err) call MPI_Comm_rank(MPI_COMM_WORLD, rank, err) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (rank .EQ. 0) then - filename = 'testfile.nc' - err = get_args(cmd, filename) + out_path = 'testfile.nc' + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) cmode = IOR(NF90_64BIT_DATA, NF90_CLOBBER) - err = nf90mpi_create(MPI_COMM_WORLD, filename, cmode, & + err = nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, & MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_create: ') @@ -61,7 +69,7 @@ program main if (xtype .NE. NF90_INT64) then msg = '*** TESTING F90 test_attr_int64.f90 ' - call pass_fail(1, msg) + call pass_fail(1, msg, 0) STOP 2 endif @@ -85,7 +93,7 @@ program main if (err .NE. 
NF90_EBADTYPE) then 10 FORMAT(A,I3) write(msg,10) '*** test_attr_int64.f90 expects NF90_EBADTYPE but got ', err - call pass_fail(1, msg) + call pass_fail(1, msg, 0) STOP 2 endif @@ -95,9 +103,20 @@ program main err = nf90mpi_close(ncid) call check(err, 'In nf90mpi_close: ') - msg = '*** TESTING F90 '//trim(cmd)//' for scalar attr of INT64 ' - if (rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (rank .eq. 0) then + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - scalar attr of INT64 ' + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(err) + call MPI_Finalize(err) end program main diff --git a/test/F90/test_fill.f90 b/test/F90/test_fill.f90 index a35f524670..fbcd6a6dfd 100644 --- a/test/F90/test_fill.f90 +++ b/test/F90/test_fill.f90 @@ -18,25 +18,28 @@ subroutine check(err, message) if (err .NE. NF90_NOERR) then write(6,*) trim(message), trim(nf90mpi_strerror(err)) msg = '*** TESTING F90 test_fill.f90 ' - call pass_fail(1, msg) + call pass_fail(1, msg, 0) STOP 2 end if end subroutine check - integer function tst_fmt(filename, mode) + integer function tst_fmt(out_path, mode) use mpi use pnetcdf implicit none - character(LEN=256) filename - integer i, err, ierr, rank + character(LEN=256) out_path + integer i, err, ierr, rank, nprocs integer :: ncid, mode, cmode, dimid(1), varid integer(kind=MPI_OFFSET_KIND) :: start(1) integer(kind=MPI_OFFSET_KIND) :: count(1) + integer(kind=MPI_OFFSET_KIND) :: dim_len integer(kind=MPI_OFFSET_KIND), parameter :: len = 3 integer, parameter :: k = selected_int_kind(18) integer(kind=k) :: buf(len) + logical keep_files + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) tst_fmt = 0 @@ -47,11 +50,12 @@ integer function tst_fmt(filename, mode) ! 
create netcdf file cmode = IOR(mode, NF90_CLOBBER) - err = nf90mpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, ncid) + err = nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_create: ') tst_fmt = tst_fmt + err - err = nf90mpi_def_dim(ncid, "dim", len, dimid(1)) + dim_len = len * nprocs + err = nf90mpi_def_dim(ncid, "dim", dim_len, dimid(1)) call check(err, 'In nf90mpi_def_dim: ') tst_fmt = tst_fmt + err @@ -74,7 +78,7 @@ integer function tst_fmt(filename, mode) tst_fmt = tst_fmt + err ! Write buf - start(1) = 1 + start(1) = len * rank + 1 count(1) = len err = nf90mpi_put_var_all(ncid, varid, buf, start, count) call check(err, 'In nf90mpi_put_var_all: ') @@ -90,35 +94,54 @@ program test use mpi use pnetcdf implicit none - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer err, ierr, nerrs, rank, get_args, tst_fmt + logical keep_files + double precision timing + + call MPI_Init(ierr) + + timing = MPI_Wtime() - call MPI_Init(err) call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (rank .EQ. 0) then - filename = 'testfile.nc' - err = get_args(cmd, filename) + out_path = 'testfile.nc' + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, err) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, err) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) nerrs = 0 if (PNETCDF_DRIVER_NETCDF4 .EQ. 
1) then - err = tst_fmt(filename, NF90_NETCDF4) + err = tst_fmt(out_path, NF90_NETCDF4) nerrs = nerrs + err endif - err = tst_fmt(filename, NF90_64BIT_DATA) + err = tst_fmt(out_path, NF90_64BIT_DATA) nerrs = nerrs + err - msg = '*** TESTING F90 '//trim(cmd)//' for _FillValue ' - if (rank .eq. 0) call pass_fail(nerrs, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (rank .eq. 0) then + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - _FillValue ' + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(err) + call MPI_Finalize(err) end program test diff --git a/test/F90/test_intent.f90 b/test/F90/test_intent.f90 index 70612ca86b..6bdecf206e 100644 --- a/test/F90/test_intent.f90 +++ b/test/F90/test_intent.f90 @@ -21,7 +21,7 @@ subroutine check(err, message) if (err .NE. NF90_NOERR) then write(6,*) trim(message), trim(nf90mpi_strerror(err)) msg = '*** TESTING F90 test_intent.f90 ' - call pass_fail(1, msg) + call pass_fail(1, msg, 0) STOP 2 end if end subroutine check @@ -36,12 +36,13 @@ program main FourByteInt = selected_int_kind(9), & EightByteInt = selected_int_kind(18) - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer err, ierr, rank, get_args integer cmode, ncid, varid, dimid(1), req(1), status(1) integer(kind=MPI_OFFSET_KIND) start(1) integer(kind=MPI_OFFSET_KIND) count(1) integer(kind=MPI_OFFSET_KIND) bufsize + logical keep_files character(LEN=3) cbuf integer(KIND=OneByteInt) i1buf(3) @@ -58,23 +59,30 @@ program main PARAMETER( rbuf=(/1.0,2.0,3.0/)) PARAMETER( dbuf=(/1.0,2.0,3.0/)) PARAMETER(i8buf=(/1_EightByteInt,2_EightByteInt,3_EightByteInt/)) + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) - ! 
take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (rank .EQ. 0) then - filename = 'testfile.nc' - err = get_args(cmd, filename) + out_path = 'testfile.nc' + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) ! create file, truncate it if exists cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - err = nf90mpi_create(MPI_COMM_WORLD, filename, cmode, & + err = nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, & MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_create: ') @@ -109,42 +117,57 @@ program main call check(err, 'In nfmpi_put_att_int8: ') ! define a variable of an integer array of size 3 in the nc file - err = nfmpi_def_dim(ncid, 'X', 3_MPI_OFFSET_KIND, dimid(1)) - call check(err, 'In nfmpi_def_dim: ') + err = nf90mpi_def_dim(ncid, 'X', 3_MPI_OFFSET_KIND, dimid(1)) + call check(err, 'In nf90mpi_def_dim: ') + + err = nf90mpi_def_var(ncid, 'var', NF90_INT, dimid, varid) + call check(err, 'In nf90mpi_def_var: ') - err = nfmpi_def_var(ncid, 'var', NF90_INT, 1, dimid, varid) - call check(err, 'In nfmpi_def_var: ') + ! fill with default fill value + err = nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT) + call check(err, 'In nf90mpi_def_var_fill: ') - err = nfmpi_enddef(ncid) - call check(err, 'In nfmpi_enddef: ') + err = nf90mpi_enddef(ncid) + call check(err, 'In nf90mpi_enddef: ') ! 
bufsize must be max of data type converted before and after bufsize = 3*4 - err = nfmpi_buffer_attach(ncid, bufsize) - call check(err, 'In nfmpi_buffer_attach: ') + err = nf90mpi_buffer_attach(ncid, bufsize) + call check(err, 'In nf90mpi_buffer_attach: ') start(1) = 1 count(1) = 3 - err = nfmpi_bput_vara_int(ncid, varid, start, count, ibuf, req(1)) + err = nfmpi_bput_vara_int(ncid, varid, start, count, ibuf(1:), req(1)) call check(err, 'In nfmpi_bput_vara_int: ') - err = nfmpi_wait_all(ncid, 1, req, status) - call check(err, 'In nfmpi_wait_all: ') + err = nf90mpi_wait_all(ncid, 1, req, status) + call check(err, 'In nf90mpi_wait_all: ') if (status(1) .ne. NF90_NOERR) then - print*,'Error at bput status ', nfmpi_strerror(status(1)) + print*,'Error at bput status ', nf90mpi_strerror(status(1)) endif - err = nfmpi_buffer_detach(ncid) - call check(err, 'In nfmpi_buffer_detach: ') + err = nf90mpi_buffer_detach(ncid) + call check(err, 'In nf90mpi_buffer_detach: ') ! close the file err = nf90mpi_close(ncid) call check(err, 'In nf90mpi_close: ') - msg = '*** TESTING F90 '//trim(cmd)//' for INTENT modifier ' - if (rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - INTENT modifier ' + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) end program main diff --git a/test/F90/tst_f90.f90 b/test/F90/tst_f90.f90 index 3369556c9a..4b96cf5dd2 100644 --- a/test/F90/tst_f90.f90 +++ b/test/F90/tst_f90.f90 @@ -72,26 +72,36 @@ program netcdfTest character (len = 20) frTimeUnits real (kind = FourByteReal), dimension(numLats) :: latVarBuf real (kind = FourByteReal), dimension(numLons) :: lonVarBuf - character(LEN=256) filename, cmd, msg - integer my_rank, p, info + character(LEN=256) out_path, in_path, cmd, msg + integer my_rank, nprocs, info integer i, nformats, old_format integer formats(2) + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + + if (my_rank > 0) goto 999; -! if (p .ne. 1 .AND. my_rank .eq. 0) then +! if (nprocs .ne. 1 .AND. my_rank .eq. 0) then ! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' ! endif @@ -114,7 +124,7 @@ program netcdfTest ! call MPI_Info_set(info, "romio_pvfs2_posix_write", "enable",ierr) ! 
Create the file - call check(nf90mpi_create(MPI_COMM_WORLD, filename, nf90_clobber, info, ncFileID)) + call check(nf90mpi_create(MPI_COMM_SELF, out_path, nf90_clobber, info, ncFileID)) timeStringLen = LEN(timeString) @@ -183,15 +193,15 @@ program netcdfTest call check(nf90mpi_put_var_all(ncFileID, pressVarID, pressure(:, :, 2:2), & start = (/ 1_EightByteInt, 1_EightByteInt, 2_EightByteInt /)) ) - call check(nfmpi_begin_indep_data(ncFileID)) + call check(nf90mpi_begin_indep_data(ncFileID)) scalarVarBuf = 10 call check(nf90mpi_put_var(ncFileID, scalarVarID, scalarVarBuf)) - call check(nfmpi_end_indep_data(ncFileID)) + call check(nf90mpi_end_indep_data(ncFileID)) call check(nf90mpi_close(ncFileID)) ! Now open the file to read and check a few values - call check(nf90mpi_open(MPI_COMM_WORLD, filename, NF90_NOWRITE, info, ncFileID)) + call check(nf90mpi_open(MPI_COMM_SELF, out_path, NF90_NOWRITE, info, ncFileID)) call check(nf90mpi_inq_varid(ncFileID,"frtime",frTimeVarID)) call check(nf90mpi_get_att(ncFileID,frTimeVarID,"units",frTimeUnits)) if(frTimeUnits .ne. "hours") then @@ -202,10 +212,21 @@ program netcdfTest call MPI_Info_free(info, ierr) enddo + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F90 '//trim(cmd) - if (my_rank .eq. 0) call pass_fail(0, msg) + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! 
Internal subroutine - checks error status after each netcdf, prints out text message each time diff --git a/test/F90/tst_f90_cdf5.f90 b/test/F90/tst_f90_cdf5.f90 index eae87baeef..ddd2c29703 100644 --- a/test/F90/tst_f90_cdf5.f90 +++ b/test/F90/tst_f90_cdf5.f90 @@ -14,56 +14,77 @@ program tst_f90_nc4 integer :: fh, cmode, err, ierr, dimid, varid, ndim, nvar, get_args character (len = *), parameter :: FILE_NAME = "tst_f90_nc4.nc" integer(KIND=MPI_OFFSET_KIND) :: ten=10 - character(LEN=256) filename, cmd, msg - integer my_rank, p, fillmode + character(LEN=256) out_path, in_path, cmd, msg + integer my_rank, nprocs, fillmode + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) -! if (p .ne. 1 .AND. my_rank .eq. 0) then + if (my_rank > 0) goto 999; + +! if (nprocs .ne. 1 .AND. my_rank .eq. 0) then ! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' ! 
endif cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call check(nf90mpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, fh)) + call check(nf90mpi_create(MPI_COMM_SELF, out_path, cmode, MPI_INFO_NULL, fh)) call check(nf90mpi_def_dim(fh, 'fred', ten, dimid)) call check(nf90mpi_def_var(fh, 'john', NF90_INT, (/dimid/), varid)) call check(nf90mpi_set_fill(fh, NF90_FILL, fillmode)) call check(nf90mpi_close(fh)) ! Check the file. - call check(nf90mpi_open(MPI_COMM_WORLD, filename, NF90_WRITE, MPI_INFO_NULL, fh)) + call check(nf90mpi_open(MPI_COMM_SELF, out_path, NF90_WRITE, MPI_INFO_NULL, fh)) call check(nf90mpi_inquire(fh, nDimensions = ndim, nVariables = nvar)) if (nvar .ne. 1 .or. ndim .ne. 1) stop 3 call check(nf90mpi_close(fh)) - call check(nf90mpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, fh)) + call check(nf90mpi_create(MPI_COMM_SELF, out_path, cmode, MPI_INFO_NULL, fh)) call check(nf90mpi_def_dim(fh, 'fred', ten, dimid)) call check(nf90mpi_def_var(fh, 'john', NF90_INT, (/dimid/), varid)) call check(nf90mpi_set_fill(fh, NF90_FILL, fillmode)) call check(nf90mpi_close(fh)) ! Check the file. - call check(nf90mpi_open(MPI_COMM_WORLD, filename, NF90_WRITE, MPI_INFO_NULL, fh)) + call check(nf90mpi_open(MPI_COMM_SELF, out_path, NF90_WRITE, MPI_INFO_NULL, fh)) call check(nf90mpi_inquire(fh, nDimensions = ndim, nVariables = nvar)) if (nvar .ne. 1 .or. ndim .ne. 1) stop 3 call check(nf90mpi_close(fh)) - msg = '*** TESTING F90 '//trim(cmd) - if (my_rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd) + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! 
This subroutine handles errors by printing an error message and diff --git a/test/F90/tst_io.f90 b/test/F90/tst_io.f90 index b0555374da..0d7c1042c5 100644 --- a/test/F90/tst_io.f90 +++ b/test/F90/tst_io.f90 @@ -30,26 +30,35 @@ program tst_io ! needed for netcdf integer :: ncid, x1id, x2id, x3id, x4id, vrid ! integer :: vrids, vridt, vridu, vridv, vridw, vridx, vridy, vridz - character(LEN=256) dirpath, cmd, msg - integer my_rank, p + character(LEN=256) out_path, in_path, cmd, msg + integer my_rank, nprocs integer i, nformats, old_format integer formats(2) + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (my_rank .EQ. 0) then - dirpath = '.' - err = get_args(cmd, dirpath) + out_path = '.' + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(dirpath, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + + if (my_rank > 0) goto 999; -! if (p .ne. 1 .AND. my_rank .eq. 0) then +! if (nprocs .ne. 1 .AND. my_rank .eq. 0) then ! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' ! endif @@ -80,13 +89,13 @@ program tst_io do i = nformats, 2 call check(nf90mpi_set_default_format(formats(i), old_format), 10) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm1, ncid, vrid, x, prsz1, prsz2, prsz3, prsz4, & - call setupNetCDF (trim(dirpath)//'/'//nclFilenm1, ncid, vrid, prsz1, prsz2, prsz3, prsz4, & + ! 
call setupNetCDF (trim(out_path)//'/'//nclFilenm1, ncid, vrid, x, prsz1, prsz2, prsz3, prsz4, & + call setupNetCDF (trim(out_path)//'/'//nclFilenm1, ncid, vrid, prsz1, prsz2, prsz3, prsz4, & x1id, x2id, x3id, x4id, NF90_CLOBBER, 20) call system_clock(start) - call check(nfmpi_begin_indep_data(ncid), 11) + call check(nf90mpi_begin_indep_data(ncid), 11) call check (NF90MPI_PUT_VAR(ncid, vrid, x), 18) - call check(nfmpi_end_indep_data(ncid), 12) + call check(nf90mpi_end_indep_data(ncid), 12) call system_clock(now) ncint1 = now - start ! print 3, size, "MB"," netcdf write = ", ncint1 * clockRate, & @@ -97,12 +106,12 @@ program tst_io call system_clock(start) do i1 = 1, repct - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm1, ncid, vrid, x, prsz1, prsz2, prsz3, prsz4, & - call setupNetCDF (trim(dirpath)//'/'//nclFilenm1, ncid, vrid, prsz1, prsz2, prsz3, prsz4, & + ! call setupNetCDF (trim(out_path)//'/'//nclFilenm1, ncid, vrid, x, prsz1, prsz2, prsz3, prsz4, & + call setupNetCDF (trim(out_path)//'/'//nclFilenm1, ncid, vrid, prsz1, prsz2, prsz3, prsz4, & x1id, x2id, x3id, x4id, NF90_CLOBBER, 130) - call check(nfmpi_begin_indep_data(ncid), 11) + call check(nf90mpi_begin_indep_data(ncid), 11) call check (NF90MPI_PUT_VAR(ncid, vrid, x), 23 + i1) - call check(nfmpi_end_indep_data(ncid), 11) + call check(nf90mpi_end_indep_data(ncid), 11) call check (NF90MPI_CLOSE(ncid), 15) enddo call system_clock(now) @@ -112,23 +121,23 @@ program tst_io ! 4 format("Time for", i5, "MB", i3, a22, i7, " msec. Spd ratio = ", f5.2) ! call system_clock(start) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm3, ncid, vrids, s, prsz1, prsz2, prsz3, prsz4, & + ! call setupNetCDF (trim(out_path)//'/'//nclFilenm3, ncid, vrids, s, prsz1, prsz2, prsz3, prsz4, & ! x1id, x2id, x3id, x4id, NF90_CLOBBER, 20) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm4, ncid, vridt, t, prsz1, prsz2, prsz3, prsz4, & + ! 
call setupNetCDF (trim(out_path)//'/'//nclFilenm4, ncid, vridt, t, prsz1, prsz2, prsz3, prsz4, & ! x1id, x2id, x3id, x4id, NF90_CLOBBER, 30) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm5, ncid, vridu, u, prsz1, prsz2, prsz3, prsz4, & + ! call setupNetCDF (trim(out_path)//'/'//nclFilenm5, ncid, vridu, u, prsz1, prsz2, prsz3, prsz4, & ! x1id, x2id, x3id, x4id, NF90_CLOBBER, 40) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm6, ncid, vridv, v, prsz1, prsz2, prsz3, prsz4, & + ! call setupNetCDF (trim(out_path)//'/'//nclFilenm6, ncid, vridv, v, prsz1, prsz2, prsz3, prsz4, & ! x1id, x2id, x3id, x4id, NF90_CLOBBER, 50) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm7, ncid, vridw, w, prsz1, prsz2, prsz3, prsz4, & + ! call setupNetCDF (trim(out_path)//'/'//nclFilenm7, ncid, vridw, w, prsz1, prsz2, prsz3, prsz4, & ! x1id, x2id, x3id, x4id, NF90_CLOBBER, 60) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm8, ncid, vridx, x, prsz1, prsz2, prsz3, prsz4, & + ! call setupNetCDF (trim(out_path)//'/'//nclFilenm8, ncid, vridx, x, prsz1, prsz2, prsz3, prsz4, & ! x1id, x2id, x3id, x4id, NF90_CLOBBER, 70) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm9, ncid, vridy, y, prsz1, prsz2, prsz3, prsz4, & + ! call setupNetCDF (trim(out_path)//'/'//nclFilenm9, ncid, vridy, y, prsz1, prsz2, prsz3, prsz4, & ! x1id, x2id, x3id, x4id, NF90_CLOBBER, 80) - ! call setupNetCDF (trim(dirpath)//'/'//nclFilenm10, ncid, vridz, z, prsz1, prsz2, prsz3, prsz4, & + ! call setupNetCDF (trim(out_path)//'/'//nclFilenm10, ncid, vridz, z, prsz1, prsz2, prsz3, prsz4, & ! x1id, x2id, x3id, x4id, NF90_CLOBBER, 90) - ! call check(nfmpi_begin_indep_data(ncid), 11) + ! call check(nf90mpi_begin_indep_data(ncid), 11) ! call check (NF90MPI_PUT_VAR(ncid, vrids, s), 118) ! call check (NF90MPI_PUT_VAR(ncid, vridt, t), 119) ! call check (NF90MPI_PUT_VAR(ncid, vridu, u), 120) @@ -137,17 +146,29 @@ program tst_io ! call check (NF90MPI_PUT_VAR(ncid, vridx, x), 123) ! 
call check (NF90MPI_PUT_VAR(ncid, vridy, y), 124) ! call check (NF90MPI_PUT_VAR(ncid, vridz, z), 125) - ! call check(nfmpi_end_indep_data(ncid), 11) + ! call check(nf90mpi_end_indep_data(ncid), 11) ! call system_clock(now) ! ncint3 = now - start ! call check (NF90MPI_CLOSE(ncid), 16) ! print 4, size, 8, " netcdf file writes = ", ncint3 * clockRate, & ! real(ncint3)/real(wrint3); enddo - msg = '*** TESTING F90 '//trim(cmd) - if (my_rank .eq. 0) call pass_fail(0, msg) - 999 call MPI_Finalize(ierr) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. keep_files) then + err = nf90mpi_delete(trim(out_path)//'/'//nclFilenm1, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd) + call pass_fail(0, msg, timing) + end if + + call MPI_Finalize(ierr) contains subroutine check (st, n) ! checks the return error code @@ -170,7 +191,7 @@ subroutine setupNetCDF(fn, nc, vr, d1, d2, d3, d4, do1, do2, & integer, intent(inout) :: nc integer, dimension(4) :: dimids (4) - call check (NF90MPI_CREATE (MPI_COMM_WORLD, fn, stat, MPI_INFO_NULL, nc), deb + 1) + call check (NF90MPI_CREATE (MPI_COMM_SELF, fn, stat, MPI_INFO_NULL, nc), deb + 1) call check (NF90MPI_DEF_DIM(nc, "d1", d1, do1), deb + 2) call check (NF90MPI_DEF_DIM(nc, "d2", d2, do2), deb + 3) call check (NF90MPI_DEF_DIM(nc, "d3", d3, do3), deb + 4) diff --git a/test/F90/tst_types2.f90 b/test/F90/tst_types2.f90 index f1506eb76e..3e40549d2e 100644 --- a/test/F90/tst_types2.f90 +++ b/test/F90/tst_types2.f90 @@ -34,26 +34,31 @@ program tst_types2 integer :: cmode, err, ierr, get_args integer(KIND=MPI_OFFSET_KIND) :: dlen_ll - character(LEN=256) filename, cmd, msg - integer my_rank, p - logical verbose + character(LEN=256) out_path, in_path, cmd, msg + integer my_rank, nprocs + logical verbose, keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call 
MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) verbose = .FALSE. - if (p .ne. 4 .AND. my_rank .eq. 0 .AND. verbose) then + if (nprocs .ne. 4 .AND. my_rank .eq. 0 .AND. verbose) then print *, 'Warning: ',trim(cmd),' is design to run on 4 processes.' endif @@ -123,7 +128,7 @@ program tst_types2 ! Create the netCDF file. cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call check(nf90mpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, ncid)) + call check(nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, MPI_INFO_NULL, ncid)) ! Define dimensions. dlen_ll = DLEN @@ -189,7 +194,7 @@ program tst_types2 call check(nf90mpi_close(ncid)) ! Reopen the netCDF file. - call check(nf90mpi_open(MPI_COMM_WORLD, filename, nf90_nowrite, MPI_INFO_NULL, ncid)) + call check(nf90mpi_open(MPI_COMM_WORLD, out_path, nf90_nowrite, MPI_INFO_NULL, ncid)) ! Read in the large numbers. call check(nf90mpi_get_var_all(ncid, varid1, data1_in)) @@ -272,10 +277,21 @@ program tst_types2 ! Close the file. call check(nf90mpi_close(ncid)) - msg = '*** TESTING F90 '//trim(cmd)//' for 64-bit integer types' - if (my_rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - 64-bit integer types' + call pass_fail(0, msg, timing) + end if - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) ! This subroutine handles errors by printing an error message and ! exiting with a non-zero status. diff --git a/test/F90/wrap_runs.sh b/test/F90/wrap_runs.sh index 716aacf063..08534a44b1 100755 --- a/test/F90/wrap_runs.sh +++ b/test/F90/wrap_runs.sh @@ -16,7 +16,7 @@ outfile=`basename $1` OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc # echo "" @@ -36,7 +58,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} 
${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -49,6 +71,7 @@ for j in ${safe_modes} ; do fi fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/Makefile.am b/test/Makefile.am index f5eb9d5b4a..7d94074039 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -64,6 +64,8 @@ else PTEST_SUBDIRS = $(SUBDIRS) endif +EXTRA_DIST = parallel_run.sh + ptest: @for d in $(PTEST_SUBDIRS) ; do \ $(MAKE) $(MFLAGS) -C $$d ptest $$* || exit 1 ; \ diff --git a/test/adios/Makefile.am b/test/adios/Makefile.am index 7a670fa3e7..96de1154e9 100644 --- a/test/adios/Makefile.am +++ b/test/adios/Makefile.am @@ -58,21 +58,31 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; TESTS_ENVIRONMENT += export ADIOS_VER_GE_1132="$(ADIOS_VER_GE_1132)"; +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + TESTS = $(TESTPROGRAMS) TEST_EXTENSIONS = .sh LOG_COMPILER = $(srcdir)/wrap_runs.sh SH_LOG_COMPILER = -NC_FILES = $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.nc) - CLEANFILES = $(M4_SRCS:.m4=.c) \ $(TESTOUTDIR)/put_get_all_kinds.nc.cdf4 \ - $(NC_FILES) testfile.nc + $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb EXTRA_DIST = $(M4_SRCS) wrap_runs.sh parallel_run.sh \ arrays_big.bp arrays.bp attributes_big.bp attributes.bp diff --git a/test/adios/att.c b/test/adios/att.c index 102edd6c10..f6cd0bca7b 100644 --- a/test/adios/att.c +++ b/test/adios/att.c @@ -130,7 +130,7 @@ int 
main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/header.c b/test/adios/header.c index 52a3892456..b7f7c9008d 100644 --- a/test/adios/header.c +++ b/test/adios/header.c @@ -130,7 +130,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/indep.c b/test/adios/indep.c index c352b77f90..1d4e4f5c86 100644 --- a/test/adios/indep.c +++ b/test/adios/indep.c @@ -99,7 +99,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/ivar.c b/test/adios/ivar.c index 07e75db54a..10821893cd 100644 --- a/test/adios/ivar.c +++ b/test/adios/ivar.c @@ -211,7 +211,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/ivarm.c b/test/adios/ivarm.c index c7ba84a516..2399cc477f 100644 --- a/test/adios/ivarm.c +++ b/test/adios/ivarm.c @@ -95,7 +95,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/ivars.c b/test/adios/ivars.c index ba7a783bc6..d2d1cb7047 100644 --- a/test/adios/ivars.c +++ b/test/adios/ivars.c @@ -95,7 +95,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, 
MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/open.c b/test/adios/open.c index a1c7117869..3eaf594497 100644 --- a/test/adios/open.c +++ b/test/adios/open.c @@ -64,7 +64,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/parallel_run.sh b/test/adios/parallel_run.sh index 612fd7591a..5678954ff1 100755 --- a/test/adios/parallel_run.sh +++ b/test/adios/parallel_run.sh @@ -15,7 +15,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "check_PROGRAMS=${check_PROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,16 +26,49 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + 
PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" if test "$i" = open ; then ${MPIRUN} ./$i ${srcdir}/arrays.bp @@ -51,5 +84,6 @@ for i in ${check_PROGRAMS} ; do fi done done + done done diff --git a/test/adios/var.c b/test/adios/var.c index ecf86d53a5..4f982df106 100644 --- a/test/adios/var.c +++ b/test/adios/var.c @@ -154,7 +154,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/varm.c b/test/adios/varm.c index a60d2aa909..369ee1b999 100644 --- a/test/adios/varm.c +++ b/test/adios/varm.c @@ -95,7 +95,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/vars.c b/test/adios/vars.c index 37fbf25365..74ec33b10f 100644 --- a/test/adios/vars.c +++ b/test/adios/vars.c @@ -92,7 +92,7 @@ int main(int argc, char** argv) { MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf("pass\n"); } MPI_Finalize(); diff --git a/test/adios/wrap_runs.sh b/test/adios/wrap_runs.sh index e619098d06..5de664572e 100755 --- a/test/adios/wrap_runs.sh +++ b/test/adios/wrap_runs.sh @@ -12,7 +12,7 @@ NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 
; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -22,8 +22,19 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + if test "$1" = ./open ; then ${TESTSEQRUN} $1 ${srcdir}/arrays.bp ${TESTSEQRUN} $1 ${srcdir}/attributes.bp diff --git a/test/burst_buffer/Makefile.am b/test/burst_buffer/Makefile.am index dde2d18ebc..ddd04aeb83 100644 --- a/test/burst_buffer/Makefile.am +++ b/test/burst_buffer/Makefile.am @@ -32,23 +32,18 @@ check_PROGRAMS = bb_bsize \ highdim \ varn -EXTRA_DIST = wrap_runs.sh parallel_run.sh +EXTRA_DIST = seq_runs.sh parallel_run.sh -NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) -META_FILES = $(NC_FILES:%=%_*.meta) -DATA_FILES = $(NC_FILES:%=%_*.data) - -CLEANFILES = $(NC_FILES) core core.* *.gcda *.gcno *.gcov gmon.out \ - $(META_FILES) $(DATA_FILES) +CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \ + $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ + $(TESTOUTDIR)/*.meta $(TESTOUTDIR)/*data ../common/libtestutils.la: set -e; cd ../common && $(MAKE) $(MFLAGS) tests -TESTPROGRAMS = $(check_PROGRAMS) - # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead -# AM_TESTS_ENVIRONMENT = export TESTPROGRAMS="$(TESTPROGRAMS)"; +# AM_TESTS_ENVIRONMENT = export check_PROGRAMS="$(check_PROGRAMS)"; # AM_TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; # AM_TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT = export SED="$(SED)"; @@ -56,42 +51,51 @@ TESTS_ENVIRONMENT += export 
srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + +TESTS = $(check_PROGRAMS) TEST_EXTENSIONS = .sh -LOG_COMPILER = $(srcdir)/wrap_runs.sh +LOG_COMPILER = $(srcdir)/seq_runs.sh SH_LOG_COMPILER = -TESTS = $(TESTPROGRAMS) - # Some of these tests are designed to run on one processes, # Run them on 4 processes to see if they can handle well -ptest ptest4: $(TESTPROGRAMS) +ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 4 || exit 1 -ptest2: $(TESTPROGRAMS) +ptest2: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 2 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 2 || exit 1 -ptest6: $(TESTPROGRAMS) +ptest6: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 6 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 6 || exit 1 -ptests: ptest2 
ptest4 ptest6 +ptests: ptest4 ptest6 ptest8 ptest10: # build check targets but not invoke diff --git a/test/burst_buffer/bb_bsize.c b/test/burst_buffer/bb_bsize.c index 2891f42edf..54ef40b529 100644 --- a/test/burst_buffer/bb_bsize.c +++ b/test/burst_buffer/bb_bsize.c @@ -23,89 +23,64 @@ #include #include #include -#include +#include /* dirname() */ #define SIZE 1024 int buffer[SIZE * SIZE]; char bsize[32]; -int main(int argc, char *argv[]) { - int i, ret = NC_NOERR, nerr = 0; - int rank, np; - int ncid, varid; - int dimid[2]; - char *filename; +static +int test_bb(const char *out_path, + int coll_io, + MPI_Info info) +{ + char *folder, *dup_out_path; + int i, err = NC_NOERR, nerrs = 0; + int rank, np, ncid, varid, dimid[2]; MPI_Offset start[2], count[2]; - MPI_Info info; - /* Initialize MPI */ - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n", argv[0]); - MPI_Finalize(); - return 1; - } - - /* Determine test file name */ - if (argc > 1) - filename = argv[1]; - else - filename = "testfile.nc"; - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for checking request > buffer size", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* Initialize file info */ - MPI_Info_create(&info); + /* add file info */ MPI_Info_set(info, "nc_burst_buf", "enable"); + /* Set default buffer size to 1/16 of the rows */ sprintf(bsize, "%u", (unsigned int)(SIZE * SIZE / 16 * sizeof(int))); MPI_Info_set(info, "nc_burst_buf_flush_buffer_size", bsize); + MPI_Info_set(info, "nc_burst_buf_overwrite", "enable"); + + dup_out_path = strdup(out_path); + folder = dirname(dup_out_path); + if (folder == NULL) + MPI_Info_set(info, "nc_burst_buf_dirname", "."); + else + MPI_Info_set(info, "nc_burst_buf_dirname", folder); + free(dup_out_path); + /* Create new netcdf file */ - ret = 
ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_create: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* Define dimensions */ - ret = ncmpi_def_dim(ncid, "X", SIZE * np, dimid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_def_dim: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } - ret = ncmpi_def_dim(ncid, "Y", SIZE, dimid + 1); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_def_dim: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + err = ncmpi_def_dim(ncid, "X", SIZE * np, dimid); + CHECK_ERR + err = ncmpi_def_dim(ncid, "Y", SIZE, dimid + 1); + CHECK_ERR /* Define variable */ - ret = ncmpi_def_var(ncid, "M", NC_INT, 2, dimid, &varid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_def_var: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + err = ncmpi_def_var(ncid, "M", NC_INT, 2, dimid, &varid); + CHECK_ERR /* Switch to data mode */ - ret = ncmpi_enddef(ncid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_enddef: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; + err = ncmpi_enddef(ncid); + CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR } /* Initialize buffer */ @@ -118,12 +93,11 @@ int main(int argc, char *argv[]) { start[1] = 0; count[0] = SIZE / 8; count[1] = SIZE; - ret = ncmpi_put_vara_int_all(ncid, varid, start, count, buffer); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_put_vara_int: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, varid, start, count, buffer); + else + err = ncmpi_put_vara_int(ncid, varid, start, count, buffer); + CHECK_ERR /* Write remaining rows */ start[0] = SIZE * rank + SIZE / 8; @@ -131,12 +105,11 @@ int main(int argc, 
char *argv[]) { count[0] = 1; count[1] = SIZE; for (; start[0] < SIZE * (rank + 1); start[0]++) { - ret = ncmpi_put_vara_int_all(ncid, varid, start, count, buffer); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_put_vara_int: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, varid, start, count, buffer); + else + err = ncmpi_put_vara_int(ncid, varid, start, count, buffer); + CHECK_ERR } /* @@ -148,48 +121,78 @@ int main(int argc, char *argv[]) { start[1] = 0; count[0] = SIZE; count[1] = SIZE; - ret = ncmpi_get_vara_int_all(ncid, varid, start, count, buffer); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_get_vara_int: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + if (coll_io) + err = ncmpi_get_vara_int_all(ncid, varid, start, count, buffer); + else + err = ncmpi_get_vara_int(ncid, varid, start, count, buffer); + CHECK_ERR /* Verify the result */ for (i = 0; i < SIZE * SIZE; i++) { if (buffer[i] != rank + 1) { - nerr++; - printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", + __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } /* Close the file */ - ret = ncmpi_close(ncid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_close: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + err = ncmpi_close(ncid); + CHECK_ERR - MPI_Info_free(&info); +err_out: + return nerrs; +} - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - ret = ncmpi_inq_malloc_size(&malloc_size); - if (ret == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", sum_size); - } +static +int 
test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int err=NC_NOERR; + MPI_Info local_info; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + MPI_Info_dup(info, &local_info); + err = test_bb(out_path, coll_io, local_info); + MPI_Info_free(&local_info); + + MPI_Info_dup(info, &local_info); + MPI_Info_set(local_info, "nc_burst_buf_shared_logs", "enable"); + err = test_bb(out_path, coll_io, local_info); + MPI_Info_free(&local_info); + + return err; +} -ERROR: - MPI_Allreduce(MPI_IN_PLACE, &nerr, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerr) printf(FAIL_STR, nerr); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "request size > buffer size", opt, test_io); MPI_Finalize(); - return nerr > 0; + return err; } diff --git a/test/burst_buffer/bb_hints.c b/test/burst_buffer/bb_hints.c index 4a3fa054d6..fbc06d06d9 100644 --- a/test/burst_buffer/bb_hints.c +++ b/test/burst_buffer/bb_hints.c @@ -23,103 +23,129 @@ #include #include #include -#include /* basename() */ +#include /* dirname() */ #include #include -int main(int argc, char** argv) { - char filename[256]; - int rank, nprocs, err, flag, nerrs=0; - int log_enabled; - int ncid; - MPI_Info info, infoused; +static +int test_bb(const char 
*out_path, + MPI_Info info) +{ + char *folder, *dup_out_path; + int err, flag, nerrs=0, ncid; + MPI_Info infoused; char hint[MPI_MAX_INFO_VAL]; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for checking offsets of new variables ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - MPI_Info_create(&info); + MPI_Info_set(info, "nc_burst_buf", "enable"); MPI_Info_set(info, "nc_burst_buf_del_on_close", "disable"); MPI_Info_set(info, "nc_burst_buf_flush_buffer_size", "256"); /* MPI_Info_set(info, "nc_burst_buf_dirname", "()@^$@!(_&$)@(#%%&)(*#$"); */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR - err = ncmpi_inq_file_info(ncid, &infoused); CHECK_ERR + MPI_Info_set(info, "nc_burst_buf_overwrite", "enable"); - MPI_Info_get(infoused, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, hint, &flag); - if (flag && strcasecmp(hint, "enable") == 0) - log_enabled = 1; + dup_out_path = strdup(out_path); + folder = dirname(dup_out_path); + if (folder == NULL) + MPI_Info_set(info, "nc_burst_buf_dirname", "."); else - log_enabled = 0; - - if (log_enabled) { - MPI_Info_get(infoused, "nc_burst_buf_del_on_close", MPI_MAX_INFO_VAL - 1, hint, &flag); - if (flag) { - if (strcmp(hint, "disable") != 0) { - printf("Error at line %d: unexpected nc_burst_buf_del_on_close = %s, but got %s\n", __LINE__, "disable", hint); - nerrs++; - } - } - else{ - printf("Error at line %d: nc_burst_buf_del_on_close is not set\n", __LINE__); + MPI_Info_set(info, "nc_burst_buf_dirname", folder); + 
free(dup_out_path); + + /* Create new netcdf file */ + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR + + err = ncmpi_inq_file_info(ncid, &infoused); CHECK_ERR + + MPI_Info_get(infoused, "nc_burst_buf_del_on_close", MPI_MAX_INFO_VAL - 1, hint, &flag); + if (flag) { + if (strcmp(hint, "disable") != 0) { + printf("Error at line %d: unexpected nc_burst_buf_del_on_close = %s, but got %s\n", + __LINE__, "disable", hint); nerrs++; + goto err_out; } + } + else{ + printf("Error at line %d: nc_burst_buf_del_on_close is not set\n", __LINE__); + nerrs++; + goto err_out; + } - MPI_Info_get(infoused, "nc_burst_buf_flush_buffer_size", MPI_MAX_INFO_VAL - 1, hint, &flag); - if (flag) { - if (strcmp(hint, "256") != 0) { - printf("Error at line %d: unexpected nc_burst_buf_flush_buffer_size = %s, but got %s\n", __LINE__, "256", hint); - nerrs++; - } - } - else{ - printf("Error at line %d: nc_burst_buf_flush_buffer_size is not set\n", __LINE__); + MPI_Info_get(infoused, "nc_burst_buf_flush_buffer_size", MPI_MAX_INFO_VAL - 1, hint, &flag); + if (flag) { + if (strcmp(hint, "256") != 0) { + printf("Error at line %d: unexpected nc_burst_buf_flush_buffer_size = %s, but got %s\n", + __LINE__, "256", hint); nerrs++; + goto err_out; } } + else{ + printf("Error at line %d: nc_burst_buf_flush_buffer_size is not set\n", __LINE__); + nerrs++; + goto err_out; + } err = ncmpi_enddef(ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - MPI_Info_free(&info); MPI_Info_free(&infoused); - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } +err_out: + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - 
if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + int err=NC_NOERR; + MPI_Info local_info; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + MPI_Info_dup(info, &local_info); + err = test_bb(out_path, local_info); + MPI_Info_free(&local_info); + + MPI_Info_dup(info, &local_info); + MPI_Info_set(local_info, "nc_burst_buf_shared_logs", "enable"); + err = test_bb(out_path, local_info); + MPI_Info_free(&local_info); + + return err; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "burt buffering hints", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/burst_buffer/bb_many_reqs.c b/test/burst_buffer/bb_many_reqs.c index a379134ac7..2f56d1aece 100644 --- a/test/burst_buffer/bb_many_reqs.c +++ b/test/burst_buffer/bb_many_reqs.c @@ -23,56 +23,48 @@ #define NREQ 2048 #define NROUND 4 -int main(int argc, char *argv[]) { - int i, j, err, nerrs = 0; - int rank, np; - int ncid, varid; +static +int test_bb(const char *out_path, + int coll_io, + MPI_Info info) +{ + char *folder, *dup_out_path; + int i, j, err, nerrs = 0, rank, np, ncid, varid, dimid[2]; int *buffer, *reqs, *stat; - int 
dimid[2]; - char *filename; MPI_Offset start[2]; - MPI_Info info; - /* Initialize MPI */ - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n", argv[0]); - MPI_Finalize(); - return 1; - } + MPI_Info_set(info, "nc_burst_buf", "enable"); + MPI_Info_set(info, "nc_burst_buf_overwrite", "enable"); - /* Determine test file name */ - if (argc > 1) - filename = argv[1]; + dup_out_path = strdup(out_path); + folder = dirname(dup_out_path); + if (folder == NULL) + MPI_Info_set(info, "nc_burst_buf_dirname", "."); else - filename = "testfile.nc"; - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for burst buffer big requests", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* Initialize file info */ - MPI_Info_create(&info); - MPI_Info_set(info, "nc_burst_buf", "enable"); + MPI_Info_set(info, "nc_burst_buf_dirname", folder); + free(dup_out_path); /* Create new netcdf file */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* Define dimensions */ - err = ncmpi_def_dim(ncid, "X", np, dimid); CHECK_ERR - err = ncmpi_def_dim(ncid, "Y", NREQ, dimid + 1); CHECK_ERR + err = ncmpi_def_dim(ncid, "X", np, dimid); CHECK_ERR + err = ncmpi_def_dim(ncid, "Y", NREQ, dimid + 1); CHECK_ERR /* Define variable */ - err = ncmpi_def_var(ncid, "M", NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var(ncid, "M", NC_INT, 2, dimid, &varid); CHECK_ERR /* Switch to data mode */ - err = ncmpi_enddef(ncid); CHECK_ERR + err = ncmpi_enddef(ncid); CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } /* Prepare Buffer */ reqs = (int*)malloc(sizeof(int) * NREQ); @@ -87,15 +79,22 @@ int main(int argc, char *argv[]) { for (i = 0; i < NREQ; 
i++) { start[0] = rank; start[1] = i; - err = ncmpi_iput_var1_int(ncid, varid, start, buffer + i, reqs + i); CHECK_ERR + err = ncmpi_iput_var1_int(ncid, varid, start, buffer + i, reqs + i); CHECK_ERR } - err = ncmpi_wait_all(ncid, NREQ, reqs, stat); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, NREQ, reqs, stat); + else + err = ncmpi_wait(ncid, NREQ, reqs, stat); + CHECK_ERR for (i = 0; i < NREQ; i++) { - err = stat[i]; CHECK_ERR + err = stat[i]; CHECK_ERR } for (i = 0; i < NREQ; i++) { if (reqs[i] != NC_REQ_NULL) { - printf("Error at line %d in %s: expecting reqs[%d] = NC_REQ_NULL but got %d\n", __LINE__, __FILE__, i, reqs[i]); + printf("Error at line %d in %s: expecting reqs[%d] = NC_REQ_NULL but got %d\n", + __LINE__, __FILE__, i, reqs[i]); + nerrs++; + goto err_out; } } @@ -104,48 +103,93 @@ int main(int argc, char *argv[]) { for (i = 0; i < NREQ; i++ ) { start[0] = rank; start[1] = i; - err = ncmpi_iget_var1_int(ncid, varid, start, buffer + i, reqs + i); CHECK_ERR + err = ncmpi_iget_var1_int(ncid, varid, start, buffer + i, reqs + i); CHECK_ERR } - err = ncmpi_wait_all(ncid, NREQ, reqs, stat); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, NREQ, reqs, stat); + else + err = ncmpi_wait(ncid, NREQ, reqs, stat); + CHECK_ERR for (i = 0; i < NREQ; i++) { - err = stat[i]; CHECK_ERR + err = stat[i]; CHECK_ERR } for (i = 0; i < NREQ; i++) { if (buffer[i] != rank + 1) { - printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", + __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } for (i = 0; i < NREQ; i++) { if (reqs[i] != NC_REQ_NULL) { - printf("Error at line %d in %s: expecting reqs[%d] = NC_REQ_NULL but got %d\n", __LINE__, __FILE__, i, reqs[i]); + printf("Error at line %d in %s: expecting reqs[%d] = NC_REQ_NULL but got %d\n", + __LINE__, __FILE__, i, reqs[i]); + nerrs++; + goto err_out; } } } 
/* Close the file */ - err = ncmpi_close(ncid); CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR - MPI_Info_free(&info); free(buffer); free(reqs); free(stat); - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", sum_size); - } +err_out: + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR, nerrs); - else printf(PASS_STR); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int err=NC_NOERR; + MPI_Info local_info; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + MPI_Info_dup(info, &local_info); + err = test_bb(out_path, coll_io, local_info); + MPI_Info_free(&local_info); + + MPI_Info_dup(info, &local_info); + MPI_Info_set(local_info, "nc_burst_buf_shared_logs", "enable"); + err = test_bb(out_path, coll_io, local_info); + MPI_Info_free(&local_info); + + return err; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "big 
requests", opt, test_io); MPI_Finalize(); - return nerrs > 0; + return err; } diff --git a/test/burst_buffer/bb_nonblocking.c b/test/burst_buffer/bb_nonblocking.c index 57cd858022..0c9de7da53 100644 --- a/test/burst_buffer/bb_nonblocking.c +++ b/test/burst_buffer/bb_nonblocking.c @@ -18,111 +18,134 @@ #include #include -int main(int argc, char *argv[]) { - int err, tmp, nerrs = 0; - int rank, np; - int ncid, varid; +static +int test_bb(const char *out_path, + MPI_Info info) +{ + char *folder, *dup_out_path; + int err, tmp, nerrs = 0, rank, np, ncid, varid, dimid[2]; int buffer, req1, req2, stat; - int dimid[2]; - char *filename; MPI_Offset start[2]; - MPI_Info info; - /* Initialize MPI */ - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n", argv[0]); - MPI_Finalize(); - return 1; - } + /* add file info */ + MPI_Info_set(info, "nc_burst_buf", "enable"); + MPI_Info_set(info, "nc_burst_buf_overwrite", "enable"); - /* Determine test file name */ - if (argc > 1) - filename = argv[1]; + dup_out_path = strdup(out_path); + folder = dirname(dup_out_path); + if (folder == NULL) + MPI_Info_set(info, "nc_burst_buf_dirname", "."); else - filename = "testfile.nc"; - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for when requests are > buffer size", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* Initialize file info */ - MPI_Info_create(&info); - MPI_Info_set(info, "nc_burst_buf", "enable"); + MPI_Info_set(info, "nc_burst_buf_dirname", folder); + free(dup_out_path); /* Create new netcdf file */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* Define dimensions */ - err = ncmpi_def_dim(ncid, "X", np, dimid); CHECK_ERR - err = 
ncmpi_def_dim(ncid, "Y", 4, dimid + 1); CHECK_ERR + err = ncmpi_def_dim(ncid, "X", np, dimid); CHECK_ERR + err = ncmpi_def_dim(ncid, "Y", 4, dimid + 1); CHECK_ERR /* Define variable */ - err = ncmpi_def_var(ncid, "M", NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var(ncid, "M", NC_INT, 2, dimid, &varid); CHECK_ERR + + err = ncmpi_set_fill(ncid, NC_FILL, NULL); CHECK_ERR /* Switch to data mode */ - err = ncmpi_enddef(ncid); CHECK_ERR + err = ncmpi_enddef(ncid); CHECK_ERR buffer = rank + 1; start[0] = 0; start[1] = 0; - err = ncmpi_iput_var1_int(ncid, varid, start, &buffer, &req1); CHECK_ERR + err = ncmpi_iput_var1_int(ncid, varid, start, &buffer, &req1); CHECK_ERR start[1] = 1; - err = ncmpi_iput_var1_int(ncid, varid, start, &buffer, &req2); CHECK_ERR + err = ncmpi_iput_var1_int(ncid, varid, start, &buffer, &req2); CHECK_ERR start[1] = 0; - err = ncmpi_get_var1_int_all(ncid, varid, start, &buffer); CHECK_ERR + err = ncmpi_get_var1_int_all(ncid, varid, start, &buffer); CHECK_ERR start[1] = 1; - err = ncmpi_get_var1_int_all(ncid, varid, start, &buffer); CHECK_ERR - err = ncmpi_cancel(ncid, 1, &req1, &stat); CHECK_ERR + err = ncmpi_get_var1_int_all(ncid, varid, start, &buffer); CHECK_ERR + err = ncmpi_cancel(ncid, 1, &req1, &stat); CHECK_ERR err = stat; EXP_ERR(NC_EFLUSHED) - err = ncmpi_wait_all(ncid, 1, &req2, &stat); CHECK_ERR - err = stat; CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req2, &stat); CHECK_ERR + err = stat; CHECK_ERR start[1] = 2; - err = ncmpi_iput_var1_int(ncid, varid, start, &buffer, &req1); CHECK_ERR + err = ncmpi_iput_var1_int(ncid, varid, start, &buffer, &req1); CHECK_ERR start[1] = 3; - err = ncmpi_iput_var1_int(ncid, varid, start, &buffer, &req2); CHECK_ERR + err = ncmpi_iput_var1_int(ncid, varid, start, &buffer, &req2); CHECK_ERR tmp = req1; - err = ncmpi_cancel(ncid, 1, &req1, &stat); CHECK_ERR - err = stat; CHECK_ERR + err = ncmpi_cancel(ncid, 1, &req1, &stat); CHECK_ERR + err = stat; CHECK_ERR start[1] = 2; - err = 
ncmpi_get_var1_int_all(ncid, varid, start, &buffer); CHECK_ERR + err = ncmpi_get_var1_int_all(ncid, varid, start, &buffer); CHECK_ERR start[1] = 3; - err = ncmpi_get_var1_int_all(ncid, varid, start, &buffer); CHECK_ERR + err = ncmpi_get_var1_int_all(ncid, varid, start, &buffer); CHECK_ERR req1 = tmp; - err = ncmpi_wait_all(ncid, 1, &req1, &stat); CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req1, &stat); CHECK_ERR err = stat; EXP_ERR(NC_EINVAL_REQUEST) - err = ncmpi_wait_all(ncid, 1, &req2, &stat); CHECK_ERR - err = stat; CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req2, &stat); CHECK_ERR + err = stat; CHECK_ERR /* Close the file */ - err = ncmpi_close(ncid); CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR - MPI_Info_free(&info); + return nerrs; +} + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + int err=NC_NOERR; + MPI_Info local_info; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + MPI_Info_dup(info, &local_info); + err = test_bb(out_path, local_info); + MPI_Info_free(&local_info); + + MPI_Info_dup(info, &local_info); + MPI_Info_set(local_info, "nc_burst_buf_shared_logs", "enable"); + err = test_bb(out_path, local_info); + MPI_Info_free(&local_info); + + MPI_Barrier(MPI_COMM_WORLD); + return err; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", sum_size); - } + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina 
= 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR, nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "nonblocking APIs", opt, test_io); MPI_Finalize(); - return nerrs > 0; + return err; } diff --git a/test/burst_buffer/highdim.c b/test/burst_buffer/highdim.c index 6957f6c0f1..fe27a6176d 100644 --- a/test/burst_buffer/highdim.c +++ b/test/burst_buffer/highdim.c @@ -58,44 +58,21 @@ #define BSIZE 1024 * 1024 -int main(int argc, char *argv[]) +static +int test_bb(const char *out_path, + int coll_io, + MPI_Info info) { - char *filename=NULL, dimname[64]; - int i, ret=NC_NOERR, nerr=0; - int rank, np, ndims; - int ncid, varid; + char *folder, *dup_out_path, dimname[64]; + int i, err=NC_NOERR, nerrs=0, rank, np, ndims, ncid, varid; int *dimid=NULL, *buffer=NULL; long long j; MPI_Offset *start=NULL, *count=NULL, *stride=NULL; - /* Initialize MPI */ - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); - if (argc > 3) { - if (!rank) printf("Usage: %s [filename]\n", argv[0]); - MPI_Finalize(); - return 1; - } - - /* Determine ndims and test file name */ - if (argc > 1) - filename = strdup(argv[1]); - else - filename = strdup("testfile.nc"); - ndims = DIM; - if (argc > 2) - ndims = atoi(argv[2]); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for high dimensional variables", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - fflush(stdout); - free(cmd_str); - } /* Allocate buffers */ 
dimid = (int*)malloc(sizeof(int) * ndims); @@ -105,8 +82,8 @@ int main(int argc, char *argv[]) buffer = (int*)malloc(sizeof(int) * BSIZE); if (dimid == NULL || start == NULL || count == NULL || stride == NULL || buffer == NULL) { printf("Error at line %d in %s: malloc error\n", __LINE__, __FILE__); - nerr++; - goto ERROR; + nerrs++; + goto err_out; } /* Initialize buffers and calculate share among processes @@ -133,104 +110,131 @@ int main(int argc, char *argv[]) start[0] = rank; stride[0] = np; + MPI_Info_set(info, "nc_burst_buf", "enable"); + MPI_Info_set(info, "nc_burst_buf_overwrite", "enable"); + + dup_out_path = strdup(out_path); + folder = dirname(dup_out_path); + if (folder == NULL) + MPI_Info_set(info, "nc_burst_buf_dirname", "."); + else + MPI_Info_set(info, "nc_burst_buf_dirname", folder); + free(dup_out_path); + /* Create new netcdf file */ - ret = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_create: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* Define dimensions */ for (i = 0; i < ndims; i++) { sprintf(dimname, "D%d", i); /* Submatrix of each process stack along the first dimension */ - if (i == 0) { - ret = ncmpi_def_dim(ncid, dimname, count[i] * np, dimid + i); - } - else{ - ret = ncmpi_def_dim(ncid, dimname, count[i], dimid + i); - } - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_enddef: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + if (i == 0) + err = ncmpi_def_dim(ncid, dimname, count[i] * np, dimid + i); + else + err = ncmpi_def_dim(ncid, dimname, count[i], dimid + i); + CHECK_ERR } /* Define variable */ - ret = ncmpi_def_var(ncid, "M", NC_INT, ndims, dimid, &varid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_def_var: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + err = 
ncmpi_def_var(ncid, "M", NC_INT, ndims, dimid, &varid); + CHECK_ERR /* Switch to data mode */ - ret = ncmpi_enddef(ncid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_enddef: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; + err = ncmpi_enddef(ncid); + CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR } /* Write variable */ - ret = ncmpi_put_vars_int_all(ncid, varid, start, count, stride, buffer); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_put_vars_int: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + if (coll_io) + err = ncmpi_put_vars_int_all(ncid, varid, start, count, stride, buffer); + else + err = ncmpi_put_vars_int(ncid, varid, start, count, stride, buffer); + CHECK_ERR /* Read it back */ - ret = ncmpi_get_vars_int_all(ncid, varid, start, count, stride, buffer); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_get_vars_int: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } + if (coll_io) + err = ncmpi_get_vars_int_all(ncid, varid, start, count, stride, buffer); + else + err = ncmpi_get_vars_int(ncid, varid, start, count, stride, buffer); + CHECK_ERR /* Verify the result */ for (i = 0; i < BSIZE; i++) { if (buffer[i] != rank + 1) { - nerr++; - printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", + __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } /* Close the file */ - ret = ncmpi_close(ncid); - if (ret != NC_NOERR) { - printf("Error at line %d in %s: ncmpi_close: %d\n", __LINE__, __FILE__, ret); - nerr++; - goto ERROR; - } - - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - ret = ncmpi_inq_malloc_size(&malloc_size); - if (ret == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 
&& sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", sum_size); - } - -ERROR: - MPI_Allreduce(MPI_IN_PLACE, &nerr, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerr) printf(FAIL_STR, nerr); - else printf(PASS_STR); - } + err = ncmpi_close(ncid); + CHECK_ERR if (start != NULL) free(start); if (count != NULL) free(count); if (stride != NULL) free(stride); if (dimid != NULL) free(dimid); if (buffer != NULL) free(buffer); - if (filename != NULL) free(filename); + +err_out: + return nerrs; +} + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int err=NC_NOERR; + MPI_Info local_info; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + MPI_Info_dup(info, &local_info); + err = test_bb(out_path, coll_io, local_info); + MPI_Info_free(&local_info); + + MPI_Info_dup(info, &local_info); + MPI_Info_set(local_info, "nc_burst_buf_shared_logs", "enable"); + err = test_bb(out_path, coll_io, local_info); + MPI_Info_free(&local_info); + + return err; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "high dimensional variables", opt, test_io); MPI_Finalize(); - return nerr > 0; + return err; } diff --git a/test/burst_buffer/parallel_run.sh 
b/test/burst_buffer/parallel_run.sh index d726ee7601..da174c4a67 100755 --- a/test/burst_buffer/parallel_run.sh +++ b/test/burst_buffer/parallel_run.sh @@ -1,24 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" -# echo "TESTPROGRAMS=${TESTPROGRAMS}" +# echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,40 +33,24 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -for i in ${TESTPROGRAMS} ; do - for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + +for i in ${check_PROGRAMS} ; do + + exe_name=`basename $i` + + for j in ${safe_modes} ; do - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export 
PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - rm -f ${OUTDIR}/$i.nc - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable;nc_burst_buf_shared_logs=enable" - rm -f ${OUTDIR}/$i.nc - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} + done # safe_modes - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - done - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.nc*.data - rm -f ${OUTDIR}/$i.nc*.meta -done +done # check_PROGRAMS diff --git a/test/burst_buffer/seq_runs.sh b/test/burst_buffer/seq_runs.sh new file mode 100755 index 0000000000..205d1fb396 --- /dev/null +++ b/test/burst_buffer/seq_runs.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. +# + +# Exit immediately if a command exits with a non-zero status. 
+set -e + +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +exe_name=`basename $1` + +# prevent user environment setting of PNETCDF_HINTS to interfere +unset PNETCDF_HINTS + +for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc + +done # safe_modes + diff --git a/test/burst_buffer/varn.c b/test/burst_buffer/varn.c index 81e93063e9..42a81d2764 100644 --- a/test/burst_buffer/varn.c +++ b/test/burst_buffer/varn.c @@ -18,58 +18,49 @@ #include #include -int main(int argc, char *argv[]) { - int i; - int err, nerrs = 0; - int rank, np; - int ncid, varid; - int dimid[2]; - int buffer[10]; - char *filename; +static +int test_bb(const char *out_path, + int coll_io, + MPI_Info info) +{ + char *folder, *dup_out_path; + int i, err, nerrs = 0, rank, np, ncid, varid, dimid[2], buffer[10]; MPI_Offset starts[10][2], counts[10][2]; MPI_Offset *Starts[10], *Counts[10]; - MPI_Info info; - /* Initialize MPI */ - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n", argv[0]); - MPI_Finalize(); - return 1; - } + /* add file info */ + MPI_Info_set(info, "nc_burst_buf", "enable"); + MPI_Info_set(info, "nc_burst_buf_overwrite", "enable"); - /* Determine test file name */ - if (argc > 1) - filename = argv[1]; + dup_out_path = strdup(out_path); + folder = dirname(dup_out_path); + if (folder == NULL) + MPI_Info_set(info, "nc_burst_buf_dirname", "."); else - filename = "testfile.nc"; - - if (rank == 
0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for when requests are > buffer size", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* Initialize file info */ - MPI_Info_create(&info); - MPI_Info_set(info, "nc_burst_buf", "enable"); + MPI_Info_set(info, "nc_burst_buf_dirname", folder); + free(dup_out_path); /* Create new netcdf file */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* Define dimensions */ - err = ncmpi_def_dim(ncid, "X", np, dimid); CHECK_ERR - err = ncmpi_def_dim(ncid, "Y", 10, dimid + 1); CHECK_ERR + err = ncmpi_def_dim(ncid, "X", np, dimid); CHECK_ERR + err = ncmpi_def_dim(ncid, "Y", 10, dimid + 1); CHECK_ERR /* Define variable */ - err = ncmpi_def_var(ncid, "M", NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var(ncid, "M", NC_INT, 2, dimid, &varid); CHECK_ERR /* Switch to data mode */ - err = ncmpi_enddef(ncid); CHECK_ERR + err = ncmpi_enddef(ncid); CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } for(i = 0; i < 10; i++){ starts[i][0] = rank; @@ -82,22 +73,46 @@ int main(int argc, char *argv[]) { } /* Standard varn */ - err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR - err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR + if (coll_io) + err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); + else + err = ncmpi_put_varn_int(ncid, varid, 10, Starts, Counts, buffer); + CHECK_ERR + + for (i=0; i<10; i++) buffer[0] = -1; + if (coll_io) + err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); + else + err = ncmpi_get_varn_int(ncid, varid, 10, Starts, Counts, buffer); + CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ + printf("Error at line %d in %s: expecting buffer[%d] = %d but got 
%d\n", + __LINE__, __FILE__, i, rank + 1, buffer[i]); nerrs++; - printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + goto err_out; } } /* NULL counts */ - err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, NULL, buffer); CHECK_ERR - err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, NULL, buffer); CHECK_ERR + if (coll_io) + err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, NULL, buffer); + else + err = ncmpi_put_varn_int(ncid, varid, 10, Starts, NULL, buffer); + CHECK_ERR + + for (i=0; i<10; i++) buffer[0] = -1; + if (coll_io) + err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, NULL, buffer); + else + err = ncmpi_get_varn_int(ncid, varid, 10, Starts, NULL, buffer); + CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ + printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", + __LINE__, __FILE__, i, rank + 1, buffer[i]); nerrs++; - printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + goto err_out; } } @@ -105,36 +120,82 @@ int main(int argc, char *argv[]) { for(i = 0; i < 10; i += 2){ Counts[i] = (MPI_Offset*)counts[i]; } - err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR - err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR + if (coll_io) + err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); + else + err = ncmpi_put_varn_int(ncid, varid, 10, Starts, Counts, buffer); + CHECK_ERR + + for (i=0; i<10; i++) buffer[0] = -1; + if (coll_io) + err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); + else + err = ncmpi_get_varn_int(ncid, varid, 10, Starts, Counts, buffer); + CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ + printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", + __LINE__, __FILE__, i, rank + 1, buffer[i]); nerrs++; - printf("Error at line %d in %s: 
expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + goto err_out; } } /* Close the file */ - err = ncmpi_close(ncid); CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR - MPI_Info_free(&info); +err_out: + return nerrs; +} - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", sum_size); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int err=NC_NOERR; + MPI_Info local_info; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + MPI_Info_dup(info, &local_info); + err = test_bb(out_path, coll_io, local_info); + MPI_Info_free(&local_info); + + MPI_Info_dup(info, &local_info); + MPI_Info_set(local_info, "nc_burst_buf_shared_logs", "enable"); + err = test_bb(out_path, coll_io, local_info); + MPI_Info_free(&local_info); + + return err; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR, nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run 
ncmpidiff for variables */ + + err = tst_main(argc, argv, "varn API", opt, test_io); MPI_Finalize(); - return nerrs > 0; + return err; } diff --git a/test/burst_buffer/wrap_runs.sh b/test/burst_buffer/wrap_runs.sh index 308ccfc19a..b4f5587ad2 100755 --- a/test/burst_buffer/wrap_runs.sh +++ b/test/burst_buffer/wrap_runs.sh @@ -14,7 +14,7 @@ outfile=`basename $1` OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -24,13 +24,39 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + +for bb_mode in 1 ; do + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "$bb_mode" = 1 ; then + PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + fi + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc done +done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.nc_0_0.data diff --git a/test/cdf_format/Makefile.am b/test/cdf_format/Makefile.am index a8d18918fe..c197821679 100644 --- a/test/cdf_format/Makefile.am +++ b/test/cdf_format/Makefile.am @@ -2,8 +2,6 @@ # Copyright (C) 
2003, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # -# $Id$ -# # @configure_input@ SUFFIXES = .o .c @@ -11,8 +9,10 @@ SUFFIXES = .o .c AM_CPPFLAGS = -I$(top_srcdir)/src/include AM_CPPFLAGS += -I$(srcdir)/../common AM_CPPFLAGS += -I$(top_builddir)/src/include +AM_CPPFLAGS += -I$(top_srcdir)/src/utils/ncmpidiff LDADD = $(top_builddir)/src/libs/libpnetcdf.la ../common/libtestutils.la +LDADD += $(top_builddir)/src/utils/ncmpidiff/libncmpidiff_core.la LDADD += @NETCDF4_LDFLAGS@ @ADIOS_LDFLAGS@ @NETCDF4_LIBS@ @ADIOS_LIBS@ if DECL_MPI_OFFSET @@ -22,15 +22,15 @@ if DECL_MPI_OFFSET # AM_FFLAGS += $(FC_DEFINE)HAVE_DECL_MPI_OFFSET endif -TESTPROGRAMS = test_inq_format \ - cdf_type \ - dim_cdf12 - -check_PROGRAMS = $(TESTPROGRAMS) tst_open_cdf5 tst_corrupt +check_PROGRAMS = test_inq_format \ + cdf_type \ + dim_cdf12 \ + tst_open_cdf5 \ + tst_corrupt # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead -# AM_TESTS_ENVIRONMENT = export TESTPROGRAMS="$(TESTPROGRAMS)"; +# AM_TESTS_ENVIRONMENT = export check_PROGRAMS="$(check_PROGRAMS)"; # AM_TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; # AM_TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT = export SED="$(SED)"; @@ -38,15 +38,27 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; TESTS_ENVIRONMENT += export BAD_FILES="$(BAD_FILES)"; -TESTS = cdf_type dim_cdf12 seq_runs.sh xfail_runs.sh +if PNETCDF_DEBUG + 
TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + +# During "make check", programs set in TESTS will be run by +# seq_runs.sh individually. Each produces its own log file. +TESTS = $(check_PROGRAMS) xfail_runs.sh TEST_EXTENSIONS = .sh -LOG_COMPILER = $(srcdir)/wrap_runs.sh +LOG_COMPILER = $(srcdir)/seq_runs.sh SH_LOG_COMPILER = BAD_FILES = bad_begin.nc5 \ @@ -57,20 +69,17 @@ BAD_FILES = bad_begin.nc5 \ XFAIL_TESTS = xfail_runs.sh -CLEANFILES = $(TESTOUTDIR)/cdf_type.nc \ - $(TESTOUTDIR)/cdf_type.bb.nc \ - $(TESTOUTDIR)/dim_cdf12.nc \ - $(TESTOUTDIR)/dim_cdf12.bb.nc \ +CLEANFILES = $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ core core.* *.gcda *.gcno *.gcov gmon.out -EXTRA_DIST = wrap_runs.sh seq_runs.sh xfail_runs.sh parallel_run.sh \ - test_cdf1.nc test_cdf2.nc test_cdf5.nc \ - test_netcdf4.nc $(BAD_FILES) +EXTRA_DIST = seq_runs.sh xfail_runs.sh parallel_run.sh \ + test_cdf.nc1 test_cdf.nc2 test_cdf.nc3 test_cdf.nc4 test_cdf.nc5 \ + $(BAD_FILES) ../common/libtestutils.la: set -e; cd ../common && $(MAKE) $(MFLAGS) tests -ptest ptests ptest4: $(TESTPROGRAMS) +ptest ptests ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" diff --git a/test/cdf_format/cdf_type.c b/test/cdf_format/cdf_type.c index 6d66e6aa3d..3a77be289f 100644 --- a/test/cdf_format/cdf_type.c +++ b/test/cdf_format/cdf_type.c @@ -101,8 +101,8 @@ int test_attr_types(const char *filename, /*----< test_var_types() >----------------------------------------------------*/ static -int test_var_types(char *filename, - int format) +int test_var_types(const char *filename, + int format) { int i, err, rank, ncid, 
cmode, nerrs=0; int dimid, varid[5]; @@ -137,6 +137,58 @@ int test_var_types(char *filename, return nerrs; } +/*----< test_io() >----------------------------------------------------------*/ +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, /* ignored */ + int coll_io, /* ignored */ + MPI_Info info) +{ + int nerrs; + + nerrs = test_attr_types(out_path, 0); /* CDF-1 */ + if (nerrs > 0) return nerrs; + + nerrs = test_attr_types(out_path, NC_64BIT_OFFSET); /* CDF-2 */ + if (nerrs > 0) return nerrs; + + nerrs = test_var_types(out_path, 0); /* CDF-1 */ + if (nerrs > 0) return nerrs; + + nerrs = test_var_types(out_path, NC_64BIT_OFFSET); /* CDF-2 */ + if (nerrs > 0) return nerrs; + + return 0; +} + +#ifndef TEST_NETCDF +/*----< main() >--------------------------------------------------------------*/ +int main(int argc, char **argv) { + + int err, formats[] = {0}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "CDF-5 dtype in CDF-1 and 2", opt, test_io); + + MPI_Finalize(); + + return err; +} +#else /*----< main() >--------------------------------------------------------------*/ int main(int argc, char **argv) { @@ -167,27 +219,10 @@ int main(int argc, char **argv) nerrs += test_var_types(filename, 0); nerrs += test_var_types(filename, NC_64BIT_OFFSET); -#ifdef TEST_NETCDF if (nerrs) printf("fail with %d mismatches\n",nerrs); else printf("pass\n"); -#else - MPI_Offset malloc_size, sum_size; - int err = 
ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } -#endif MPI_Finalize(); return (nerrs > 0); } - +#endif diff --git a/test/cdf_format/dim_cdf12.c b/test/cdf_format/dim_cdf12.c index cc8be2599c..31c3572811 100644 --- a/test/cdf_format/dim_cdf12.c +++ b/test/cdf_format/dim_cdf12.c @@ -4,7 +4,6 @@ * See COPYRIGHT notice in top-level directory. * *********************************************************************/ -/* $Id$ */ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * @@ -59,64 +58,49 @@ #include #include -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, /* ignored */ + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; int rank, nprocs, err, nerrs=0; int ncid, cmode, varid, dimid[3]; - MPI_Info info=MPI_INFO_NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Note this test program must use the 512-byte alignment setting */ - MPI_Info_create(&info); + /* use the 512-byte fixed-size variable starting file offset alignment, * which is also the header extent align size */ MPI_Info_set(info, "nc_var_align_size", "512"); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** 
TESTING C %s for defining dim in CDF-1/2 format ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - /* create a new CDF-1 file ----------------------------------------------*/ cmode = NC_CLOBBER; /* max dimension size for CDF-1 file is NC_MAX_INT */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", (MPI_Offset)1+NC_MAX_INT, &dimid[0]); EXP_ERR(NC_EDIMSIZE) err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR /* use the max dimension size to define a 1D variable */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_CHAR, 1, dimid, &varid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR /* use the max dimension size to define a 1D variable, followed by * another variable to make the file size > NC_MAX_INT */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_CHAR, 1, &dimid[0], &varid); CHECK_ERR @@ -127,19 +111,19 @@ int main(int argc, char** argv) /* use the max dimension size - 1024 to define 
a 1D variable, followed * by another variable to make the file size < 2147483647 */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT-1024, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_CHAR, 1, &dimid[0], &varid); CHECK_ERR err = ncmpi_def_var(ncid, "var1", NC_INT, 1, &dimid[1], &varid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR /* define the first variable of type short that makes the file size > * NC_MAX_INT. error should be reported in ncmpi_enddef() or * ncmpi_close() */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_SHORT, 1, &dimid[0], &varid); CHECK_ERR @@ -148,7 +132,7 @@ int main(int argc, char** argv) EXP_ERR(NC_EVARSIZE) /* define two variables to make the file size just < 2147483647 */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT-512-8, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_CHAR, 1, &dimid[0], &varid); CHECK_ERR @@ -156,7 +140,7 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERR /* define two variables to make the file size just > NC_MAX_INT */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = 
ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT/2+1, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_INT, 1, &dimid[0], &varid); CHECK_ERR @@ -168,37 +152,37 @@ int main(int argc, char** argv) cmode = NC_CLOBBER | NC_64BIT_OFFSET; /* max dimension size for CDF-2 file is NC_MAX_INT */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", (MPI_Offset)1+NC_MAX_INT, &dimid[0]); EXP_ERR(NC_EDIMSIZE) err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR /* use the max dimension size to define a 1D variable */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_CHAR, 1, dimid, &varid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR /* use the max dimension size to define a 1D variable, followed by * another variable to make the file size > 2 * NC_MAX_INT */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_CHAR, 1, 
&dimid[0], &varid); CHECK_ERR err = ncmpi_def_var(ncid, "var1", NC_INT, 1, &dimid[1], &varid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR /* define the first variable of type short that makes the file size > * 4294967295. error should be reported in ncmpi_enddef() or * ncmpi_close() */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_SHORT, 1, &dimid[0], &varid); CHECK_ERR @@ -207,7 +191,7 @@ int main(int argc, char** argv) EXP_ERR(NC_EVARSIZE) /* define 2 1D int variables of dimension size > NC_MAX_INT */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_INT, 1, &dimid[0], &varid); CHECK_ERR @@ -215,7 +199,7 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); EXP_ERR(NC_EVARSIZE) - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT/2+1, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_INT, 1, &dimid[0], &varid); CHECK_ERR @@ -226,7 +210,7 @@ int main(int argc, char** argv) /* No record variable can require more than 2^32 - 4 bytes of storage for * each record's worth of data, unless it is the last record variable. 
*/ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Z", NC_UNLIMITED, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT/64, &dimid[1]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 64, &dimid[2]); CHECK_ERR @@ -236,7 +220,7 @@ int main(int argc, char** argv) EXP_ERR(NC_EVARSIZE) /* test large record variable that is not defined last */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Z", NC_UNLIMITED, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT/64, &dimid[1]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 64, &dimid[2]); CHECK_ERR @@ -248,35 +232,41 @@ int main(int argc, char** argv) /* Note for developers: keep the last test that produces no error, so the * output file can be tested by ncvalidator in wrap_runs.sh */ - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_MAX_INT/2, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 2, &dimid[1]); CHECK_ERR err = ncmpi_def_var(ncid, "var0", NC_INT, 1, &dimid[0], &varid); CHECK_ERR err = ncmpi_def_var(ncid, "var1", NC_INT, 1, &dimid[1], &varid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - MPI_Info_free(&info); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap 
memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +/*----< main() >--------------------------------------------------------------*/ +int main(int argc, char **argv) { + + int err, formats[] = {0}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "defining dim in CDF-1/2 format", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/cdf_format/parallel_run.sh b/test/cdf_format/parallel_run.sh index 9f95d0813d..d431b282e9 100755 --- a/test/cdf_format/parallel_run.sh +++ b/test/cdf_format/parallel_run.sh @@ -1,25 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "srcdir = ${srcdir}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -28,51 +33,35 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -for j in ${safe_modes} ; do -for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./test_inq_format ${srcdir} - ${MPIRUN} ./cdf_type ${TESTOUTDIR}/cdf_type.nc - ${MPIRUN} ./dim_cdf12 ${TESTOUTDIR}/dim_cdf12.nc - - # echo "--- validating file ${TESTOUTDIR}/cdf_type.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/cdf_type.nc - # echo "--- validating file ${TESTOUTDIR}/dim_cdf12.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/dim_cdf12.nc - - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./test_inq_format ${srcdir} - ${MPIRUN} ./cdf_type ${TESTOUTDIR}/cdf_type.bb.nc - ${MPIRUN} 
./dim_cdf12 ${TESTOUTDIR}/dim_cdf12.bb.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} - - # echo "--- validating file ${TESTOUTDIR}/cdf_type.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/cdf_type.bb.nc - # echo "--- validating file ${TESTOUTDIR}/dim_cdf12.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/dim_cdf12.bb.nc - - # echo "--- ncmpidiff cdf_type.nc cdf_type.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/cdf_type.nc ${TESTOUTDIR}/cdf_type.bb.nc - # echo "--- ncmpidiff dim_cdf12.nc dim_cdf12.bb.nc ---" - # ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/dim_cdf12.nc ${TESTOUTDIR}/dim_cdf12.bb.nc - fi -done -done - -rm -f ${OUTDIR}/dim_cdf12.nc -rm -f ${OUTDIR}/cdf_type.nc -rm -f ${OUTDIR}/dim_cdf12.bb.nc -rm -f ${OUTDIR}/cdf_type.bb.nc +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + +for i in ${check_PROGRAMS} ; do + + exe_name=`basename $i` + + for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + if test "x$exe_name" = "xtest_inq_format" ; then + run_cmd ./$i -q -i ${srcdir} + continue + elif test "x$exe_name" = "xtst_corrupt" ; then + run_cmd ./$i ${srcdir} + continue + elif test "x$exe_name" = "xtst_open_cdf5" ; then + run_cmd ./$i ${srcdir}/bad_begin.nc5 + continue + fi + + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc + + done # safe_modes + +done # check_PROGRAMS diff --git a/test/cdf_format/seq_runs.sh b/test/cdf_format/seq_runs.sh index 732f08d2d3..ab5a463b50 100755 --- a/test/cdf_format/seq_runs.sh +++ b/test/cdf_format/seq_runs.sh @@ -1,32 +1,57 @@ -#!/bin/sh +#!/bin/bash # -# Copyright (C) 2003, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +exe_name=`basename $1` + # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -${TESTSEQRUN} ./test_inq_format ${srcdir} +for j in ${safe_modes} ; do -# the followings check files with corrupted header -${TESTSEQRUN} ./tst_open_cdf5 ${srcdir}/bad_begin.nc5 -${TESTSEQRUN} ./tst_corrupt ${srcdir} + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi -# echo "" + if test "x$exe_name" = xtest_inq_format ; then + run_cmd ./$1 -q -i ${srcdir} + continue + elif test "x$exe_name" = xtst_open_cdf5 ; then + # check files with corrupted header + run_cmd ./$1 ${srcdir}/bad_begin.nc5 + continue + elif test "x$exe_name" = xtst_corrupt ; then + # check files with corrupted header + run_cmd ./$1 ${srcdir} + continue + fi -if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" +done # safe_modes - ${TESTSEQRUN} ./test_inq_format ${srcdir} - - # the followings check files with corrupted header - ${TESTSEQRUN} ./tst_open_cdf5 ${srcdir}/bad_begin.nc5 - ${TESTSEQRUN} ./tst_corrupt ${srcdir} -fi diff --git a/test/cdf_format/test_cdf1.nc b/test/cdf_format/test_cdf.nc1 similarity index 100% rename from test/cdf_format/test_cdf1.nc rename to test/cdf_format/test_cdf.nc1 diff --git a/test/cdf_format/test_cdf2.nc b/test/cdf_format/test_cdf.nc2 similarity index 100% rename from 
test/cdf_format/test_cdf2.nc rename to test/cdf_format/test_cdf.nc2 diff --git a/test/cdf_format/test_netcdf4.nc b/test/cdf_format/test_cdf.nc3 similarity index 100% rename from test/cdf_format/test_netcdf4.nc rename to test/cdf_format/test_cdf.nc3 diff --git a/test/cdf_format/test_cdf.nc4 b/test/cdf_format/test_cdf.nc4 new file mode 100644 index 0000000000..c3a19dba08 Binary files /dev/null and b/test/cdf_format/test_cdf.nc4 differ diff --git a/test/cdf_format/test_cdf5.nc b/test/cdf_format/test_cdf.nc5 similarity index 100% rename from test/cdf_format/test_cdf5.nc rename to test/cdf_format/test_cdf.nc5 diff --git a/test/cdf_format/test_inq_format.c b/test/cdf_format/test_inq_format.c index e6ec67adf5..091b57cd3a 100644 --- a/test/cdf_format/test_inq_format.c +++ b/test/cdf_format/test_inq_format.c @@ -10,151 +10,90 @@ #include #include #include /* basename() */ +#include /* getopt() */ + #include #include #include -int main(int argc, char **argv) { - char dir_name[256], filename[512]; - int err, rank, nerrs=0, format, ncid; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); +static +int test_io(const char *out_path, /* ignored */ + const char *in_path, + int format, + int coll_io, + MPI_Info info) +{ + char filename[512]; + int err, nerrs=0, fmt, ncid; - if (argc > 2) { - if (!rank) printf("Usage: %s dir_name\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(dir_name, 256, "%s", argv[1]); - else strcpy(dir_name, "."); - MPI_Bcast(dir_name, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for inquiring file formats ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } + sprintf(filename,"%s/test_cdf.nc%d",in_path, format); + // printf("%s at %d: input filename %s\n",basename(__FILE__),__LINE__,filename); - /* test CDF-1 -----------------------------------------------------------*/ - 
sprintf(filename,"%s/test_cdf1.nc",dir_name); err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, - &ncid); CHECK_ERR - - /* test NULL argument */ - err = ncmpi_inq_format(ncid, NULL); CHECK_ERR - - err = ncmpi_inq_format(ncid, &format); CHECK_ERR - if (format != NC_FORMAT_CLASSIC) { - printf("Error at line %d in %s: expecting CDF-1 format for file %s but got %d\n", - __LINE__,__FILE__,filename,format); - nerrs++; + &ncid); + if (format == NC_FORMAT_NETCDF4 && PNETCDF_DRIVER_NETCDF4 == 0) { + EXP_ERR(NC_ENOTBUILT) + return 0; } - err = ncmpi_close(ncid); CHECK_ERR - - /* test NULL argument */ - err = ncmpi_inq_file_format(filename, NULL); CHECK_ERR - - err = ncmpi_inq_file_format(filename, &format); CHECK_ERR - if (format != NC_FORMAT_CLASSIC) { - printf("Error at line %d in %s: expecting CDF-1 format for file %s but got %d\n", - __LINE__,__FILE__,filename,format); - nerrs++; + else { + CHECK_ERR + if (err != NC_NOERR) { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + printf("Error in %s at %d: rank %d failed to open file %s\n", + basename(__FILE__), __LINE__, rank, filename); + return 1; + } } - /* test CDF-2 -----------------------------------------------------------*/ - sprintf(filename,"%s/test_cdf2.nc",dir_name); - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, - &ncid); CHECK_ERR - /* test NULL argument */ err = ncmpi_inq_format(ncid, NULL); CHECK_ERR - err = ncmpi_inq_format(ncid, &format); CHECK_ERR - if (format != NC_FORMAT_CDF2) { - printf("Error at line %d in %s: expecting CDF-2 format for file %s but got %d\n", - __LINE__,__FILE__,filename,format); - nerrs++; - } - err = ncmpi_close(ncid); CHECK_ERR + err = ncmpi_inq_format(ncid, &fmt); CHECK_ERR - err = ncmpi_inq_file_format(filename, &format); CHECK_ERR - if (format != NC_FORMAT_CDF2) { - printf("Error at line %d in %s: expecting CDF-2 format for file %s but got %d\n", - __LINE__,__FILE__,filename,format); + if (fmt != format) { + printf("Error in %s at 
%d: expect CDF-%d format for file %s but got %d\n", + __FILE__,__LINE__,format,filename,fmt); nerrs++; } - - /* test CDF-5 -----------------------------------------------------------*/ - sprintf(filename,"%s/test_cdf5.nc",dir_name); - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, - &ncid); CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR /* test NULL argument */ - err = ncmpi_inq_format(ncid, NULL); CHECK_ERR + err = ncmpi_inq_file_format(filename, NULL); CHECK_ERR - err = ncmpi_inq_format(ncid, &format); CHECK_ERR - if (format != NC_FORMAT_CDF5) { - printf("Error at line %d in %s: expecting CDF-5 format for file %s but got %d\n", - __LINE__,__FILE__,filename,format); - nerrs++; - } - err = ncmpi_close(ncid); CHECK_ERR + err = ncmpi_inq_file_format(filename, &fmt); CHECK_ERR - err = ncmpi_inq_file_format(filename, &format); CHECK_ERR - if (format != NC_FORMAT_CDF5) { - printf("Error at line %d in %s: expecting CDF-5 format for file %s but got %d\n", - __LINE__,__FILE__,filename,format); + if (fmt != format) { + printf("Error in %s at %d: expect CDF-%d format for file %s but got %d\n", + __FILE__,__LINE__,format,filename,fmt); nerrs++; } - /* test NetCDF4 --------------------------------------------------------*/ - sprintf(filename,"%s/test_netcdf4.nc",dir_name); - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); - if (PNETCDF_DRIVER_NETCDF4 == 0) - EXP_ERR(NC_ENOTBUILT) - else { /* NetCDF-4 is enabled */ - CHECK_ERR - - /* test NULL argument */ - err = ncmpi_inq_format(ncid, NULL); CHECK_ERR + return nerrs; +} - err = ncmpi_inq_format(ncid, &format); CHECK_ERR - if (format != NC_FORMAT_NETCDF4) { - printf("Error at line %d in %s: expecting NetCDF-4 format for file %s but got %d\n", - __LINE__,__FILE__,filename,format); - nerrs++; - } - err = ncmpi_close(ncid); CHECK_ERR +int main(int argc, char **argv) { - /* test NULL argument */ - err = ncmpi_inq_file_format(filename, NULL); CHECK_ERR + int err; + loop_opts opt; 
- err = ncmpi_inq_file_format(filename, &format); CHECK_ERR - if (format != NC_FORMAT_NETCDF4) { - printf("Error at line %d in %s: expecting NETCDF4 format for file %s but got %d\n", - __LINE__,__FILE__,filename,format); - nerrs++; - } - } + MPI_Init(&argc, &argv); - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "inquiring file formats", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } + diff --git a/test/cdf_format/tst_corrupt.c b/test/cdf_format/tst_corrupt.c index ce13d916de..62744a0fb4 100644 --- a/test/cdf_format/tst_corrupt.c +++ b/test/cdf_format/tst_corrupt.c @@ -63,8 +63,12 @@ int main(int argc, char** argv) { char filename[1024], dirname[512]; int i, rank, err, ncid, nerrs=0; + double timing; MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (argc != 2) { @@ -78,9 +82,9 @@ int main(int argc, char** argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); sprintf(cmd_str, - "*** TESTING 
C %s for checking corrupted file header ", + "*** TESTING C %s - check corrupted file header", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -165,10 +169,12 @@ int main(int argc, char** argv) { if (malloc_size > 0) ncmpi_inq_malloc_list(); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } #endif diff --git a/test/cdf_format/tst_open_cdf5.c b/test/cdf_format/tst_open_cdf5.c index ce02a5b797..eca3eaa086 100644 --- a/test/cdf_format/tst_open_cdf5.c +++ b/test/cdf_format/tst_open_cdf5.c @@ -54,14 +54,18 @@ int main(int argc, char** argv) { char filename[256]; int nerrs=0, rank, nprocs, err, ncid; + double timing; MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (argc > 2) { if (!rank) printf("Usage: %s [filename]\n",argv[0]); - goto fn_exit; + goto err_out; } if (argc == 2) snprintf(filename, 256, "%s", argv[1]); else strcpy(filename, FILE_NAME); @@ -69,9 +73,9 @@ int main(int argc, char** argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); sprintf(cmd_str, - "*** TESTING C %s for checking begins in corrupted header", + "*** TESTING C %s - begins in corrupted header", basename(argv[0])); - printf("%-66s --- ", cmd_str); fflush(stdout); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -93,11 +97,13 @@ int main(int argc, char** argv) { if (malloc_size > 0) ncmpi_inq_malloc_list(); } -fn_exit: +err_out: + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if 
(nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/cdf_format/wrap_runs.sh b/test/cdf_format/wrap_runs.sh index c749c7dbe7..000cdc3a96 100755 --- a/test/cdf_format/wrap_runs.sh +++ b/test/cdf_format/wrap_runs.sh @@ -16,7 +16,7 @@ outfile=`basename $1` OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc @@ -35,7 +57,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -48,5 +70,6 @@ for j in ${safe_modes} ; do fi fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/cdf_format/xfail_runs.sh 
b/test/cdf_format/xfail_runs.sh index 7fddf05284..aaee1d3379 100755 --- a/test/cdf_format/xfail_runs.sh +++ b/test/cdf_format/xfail_runs.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Copyright (C) 2003, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. diff --git a/test/cdl/Makefile.am b/test/cdl/Makefile.am index 493d5ccd21..3263a09498 100644 --- a/test/cdl/Makefile.am +++ b/test/cdl/Makefile.am @@ -11,17 +11,17 @@ AM_DEFAULT_SOURCE_EXT = .c AM_CPPFLAGS = -I$(top_srcdir)/src/include AM_CPPFLAGS += -I$(srcdir)/../common AM_CPPFLAGS += -I$(top_builddir)/src/include +AM_CPPFLAGS += -I$(top_srcdir)/src/utils/ncmpidiff LDADD = $(top_builddir)/src/libs/libpnetcdf.la ../common/libtestutils.la +LDADD += $(top_builddir)/src/utils/ncmpidiff/libncmpidiff_core.la LDADD += @NETCDF4_LDFLAGS@ @ADIOS_LDFLAGS@ @NETCDF4_LIBS@ @ADIOS_LIBS@ if DECL_MPI_OFFSET AM_CPPFLAGS += -DHAVE_DECL_MPI_OFFSET endif -TESTPROGRAMS = tst_cdl_hdr_parser - -check_PROGRAMS = $(TESTPROGRAMS) +check_PROGRAMS = tst_cdl_hdr_parser # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead @@ -30,26 +30,35 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export ENABLE_NETCDF4="$(ENABLE_NETCDF4)"; -TESTS = $(TESTPROGRAMS) +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export 
ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + +TESTS = $(check_PROGRAMS) TEST_EXTENSIONS = .sh -LOG_COMPILER = $(srcdir)/wrap_runs.sh +LOG_COMPILER = $(srcdir)/seq_runs.sh SH_LOG_COMPILER = -EXTRA_DIST = wrap_runs.sh parallel_run.sh cdl_header.txt +EXTRA_DIST = seq_runs.sh parallel_run.sh cdl_header.txt -CLEANFILES = $(TESTOUTDIR)/tst_cdl_hdr_parser.nc \ +CLEANFILES = $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ core core.* *.gcda *.gcno *.gcov gmon.out ../common/libtestutils.la: set -e; cd ../common && $(MAKE) $(MFLAGS) tests -ptest ptests ptest4: $(TESTPROGRAMS) +ptest ptests ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" diff --git a/test/cdl/parallel_run.sh b/test/cdl/parallel_run.sh index 4920c61be5..5fcc26b4f6 100755 --- a/test/cdl/parallel_run.sh +++ b/test/cdl/parallel_run.sh @@ -1,24 +1,30 @@ #!/bin/bash # -# Copyright (C) 2025, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,24 +33,29 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + for i in ${check_PROGRAMS} ; do - CMD_OPTS=${TESTOUTDIR}/$i.nc - if test $i = "tst_cdl_hdr_parser" ; then - CMD_OPTS="-q -o ${TESTOUTDIR}/$i.nc ${srcdir}/cdl_header.txt" - fi + exe_name=`basename $i` + + for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + if test "x$i" = "xtst_cdl_hdr_parser" ; then + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc -i ${srcdir}/cdl_header.txt + continue + fi + + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc + + done # safe_modes - for j in ${safe_modes} ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./$i ${CMD_OPTS} - - done - rm -f ${OUTDIR}/$i.nc -done +done # check_PROGRAMS diff --git a/test/cdl/seq_runs.sh b/test/cdl/seq_runs.sh new file mode 100755 index 0000000000..830ee4b4e1 --- /dev/null +++ 
b/test/cdl/seq_runs.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. +# + +# Exit immediately if a command exits with a non-zero status. +set -e + +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +exe_name=`basename $1` + +# prevent user environment setting of PNETCDF_HINTS to interfere +unset PNETCDF_HINTS + +for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + if test "x$1" = "x./tst_cdl_hdr_parser" ; then + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc -i ${srcdir}/cdl_header.txt + continue + fi + + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc + +done # safe_modes + diff --git a/test/cdl/tst_cdl_hdr_parser.c b/test/cdl/tst_cdl_hdr_parser.c index e2ef5d2c25..9229be7e6f 100644 --- a/test/cdl/tst_cdl_hdr_parser.c +++ b/test/cdl/tst_cdl_hdr_parser.c @@ -27,81 +27,40 @@ #include -/*----< usage() >------------------------------------------------------------*/ -static void usage (char *argv0) { - char *help = "Usage: %s [OPTION] FILE\n\ - [-h] Print this help message\n\ - [-v] Verbose mode\n\ - [-o path] Output netCDF file path\n\ - FILE: Input CDL file path\n"; - fprintf (stderr, help, argv0); -} - -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - extern int optind; - char *outfile, *infile; - int i, j, rank, err=0, nerrs=0, hid, verbose; + char *name; + int i, j, rank, err=0, nerrs=0, hid, ncid, 
verbose; + int ndims, *dimids, nvars, nattrs; + void *value; + nc_type xtype; + MPI_Offset size, nelems; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - verbose = 1; - infile = NULL; - outfile = NULL; - - /* get command-line arguments */ - while ((i = getopt(argc, argv, "hqo:")) != EOF) - switch(i) { - case 'q': verbose = 0; - break; - case 'o': outfile = strdup(optarg); - break; - case 'h': - default: if (rank==0) usage(argv[0]); - MPI_Finalize(); - return 1; - } - if (outfile == NULL) outfile = strdup("testfile.nc"); - - if (argv[optind] == NULL) { - if (rank == 0) usage(argv[0]); - MPI_Finalize(); - return 1; - } - - infile = strdup(argv[optind]); + verbose = 0; - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for CDL header parser ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - err = cdl_hdr_open(infile, &hid); + err = cdl_hdr_open(in_path, &hid); if (err != NC_NOERR) exit(1); if (verbose) printf("==================================================\n"); /* create a new netcdf file */ - int ncid, cmode, format; - err = cdl_hdr_inq_format(hid, &format); CHECK_ERR if (verbose) printf("CDF file format: CDF-%d\n", format); - cmode = NC_CLOBBER; - if (format == 2) cmode |= NC_64BIT_OFFSET; - else if (format == 5) cmode |= NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, outfile, cmode, MPI_INFO_NULL, &ncid); + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); CHECK_ERR - char *name; - int ndims, *dimids, nvars, nattrs; - void *value; - nc_type xtype; - MPI_Offset size, nelems; + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define dimensions */ err = cdl_hdr_inq_ndims(hid, &ndims); CHECK_ERR @@ -173,27 +132,31 @@ int main(int argc, char **argv) err = cdl_hdr_close(hid); CHECK_ERR - if (infile != NULL) free(infile); - if (outfile != NULL) free(outfile); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "CDL header parser", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/cdl/wrap_runs.sh b/test/cdl/wrap_runs.sh index 98de428706..d70a72da40 100755 --- a/test/cdl/wrap_runs.sh +++ b/test/cdl/wrap_runs.sh @@ -21,7 +21,7 @@ fi OUTDIR=`echo 
"$TESTOUTDIR" | cut -d: -f2-` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" diff --git a/test/common/Makefile.am b/test/common/Makefile.am index fc3abb0ea4..858eb1acb6 100644 --- a/test/common/Makefile.am +++ b/test/common/Makefile.am @@ -11,9 +11,14 @@ SUFFIXES = .a .o .c .F90 .h AM_DEFAULT_SOURCE_EXT = .c AM_CPPFLAGS = -I$(top_srcdir)/src/include AM_CPPFLAGS += -I$(top_builddir)/src/include +AM_CPPFLAGS += -I$(top_srcdir)/src/utils/ncmpidiff + +LDADD = @NETCDF4_LDFLAGS@ @ADIOS_LDFLAGS@ @NETCDF4_LIBS@ @ADIOS_LIBS@ -lm check_LTLIBRARIES = libtestutils.la +libtestutils_la_LIBADD = $(top_builddir)/src/utils/ncmpidiff/libncmpidiff_core.la + libtestutils_la_SOURCES = testutils.c testutils.h CLEANFILES = core.* *.gcda *.gcno *.gcov gmon.out @@ -23,13 +28,19 @@ CLEANFILES = core.* *.gcda *.gcno *.gcov gmon.out if HAS_FORTRAN check_LTLIBRARIES += libtestutilsf.la libtestutilsf_la_SOURCES = testutilsf.F90 -libtestutils_la_LIBADD = libtestutilsf.la +libtestutils_la_LIBADD += libtestutilsf.la + CLEANFILES += testutilsf.mod endif AM_FFLAGS = AM_FCFLAGS = +if RELAX_COORD_BOUND + AM_FFLAGS += $(FC_DEFINE)RELAX_COORD_BOUND + AM_FCFLAGS += $(FC_DEFINE)RELAX_COORD_BOUND +endif + if DECL_GET_ENVIRONMENT_VARIABLE AM_FCFLAGS += $(FC_DEFINE)HAS_GET_ENVIRONMENT_VARIABLE endif diff --git a/test/common/testutils.c b/test/common/testutils.c index 3e37aa84bd..ad9f442caf 100644 --- a/test/common/testutils.c +++ b/test/common/testutils.c @@ -6,12 +6,25 @@ #include +#include /* basename() */ #include #include /* strchr(), strerror(), strdup(), strcpy(), strlen() */ +#include /* getopt(), stat() */ +#include /* stat() */ +#include /* stat() */ + #include #include #include "testutils.h" +#include + +#ifdef ENABLE_NETCDF4 +int nc_formats[5] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_NETCDF4, + NC_FORMAT_NETCDF4_CLASSIC, NC_FORMAT_64BIT_DATA}; +#else +int nc_formats[3] = 
{NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; +#endif char* nc_err_code_name(int err) { @@ -285,3 +298,351 @@ char* remove_file_system_type_prefix(const char *filename) return ret_filename; } +int is_relax_coord_bound(void) +{ + char *env_str; + int relax_coord_bound; + +#ifdef RELAX_COORD_BOUND + relax_coord_bound = 1; +#else + relax_coord_bound = 0; +#endif + if ((env_str = getenv("PNETCDF_RELAX_COORD_BOUND")) != NULL) { + /* the env variable is set */ + if (*env_str == '0') relax_coord_bound = 0; + else relax_coord_bound = 1; + } + + return relax_coord_bound; +} + +void +static tst_main_usage(char *argv0) +{ + char *base_name = basename(argv0); + char *help = + "Usage: %s [OPTIONS]...[filename]\n" + " [-h] Print help\n" + " [-q] quiet mode\n" + " [-k] Keep output files (default: no)\n" + " [-i in_path]: input file path (default: NULL)\n" + " [-o out_path]: output netCDF file name (default: %s.nc)\n"; + fprintf(stderr, help, base_name, base_name); +} + +int tst_main(int argc, + char **argv, + char *msg, /* short description about the test */ + loop_opts opt, /* test options */ + int (*tst_body)(const char*,const char*,int,int,MPI_Info)) +{ + extern int optind; + extern char *optarg; + char *in_path=NULL, *out_path=NULL; + + /* IDs for the netCDF file, dimensions, and variables. 
*/ + int nprocs, rank, err, nerrs=0, keep_files, quiet, coll_io; + int i, a, d, r, m, b; + int num_ina, num_drv, num_ind, num_chk, num_bb, num_mod; + + MPI_Info info=MPI_INFO_NULL; + double timing = MPI_Wtime(); + +#ifdef PROFILING + double itiming[256]; int k=0; +#endif + + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + keep_files = 0; + quiet = 0; + + while ((i = getopt(argc, argv, "hqki:o:")) != EOF) + switch(i) { + case 'q': + quiet = 1; + break; + case 'k': + keep_files = 1; + break; + case 'i': + in_path = strdup(optarg); + break; + case 'o': + out_path = strdup(optarg); + break; + case 'h': + default: if (rank==0) tst_main_usage(argv[0]); + MPI_Finalize(); + return 1; + } + + if (out_path == NULL) + out_path = strdup("testfile.nc"); +#if 0 + else { + /* check if filename is a directory */ + struct stat sb; + + snprintf(filename, 256, "%s", argv[optind]); + if (stat(filename, &sb) == 0 && S_ISDIR(sb.st_mode)) + append_suffix = 0; + } +#endif + + if (rank == 0) { + char *cmd_str = (char *)malloc(strlen(argv[0]) + 256); + sprintf(cmd_str, "*** TESTING C %s - %s", basename(argv[0]), msg); + printf("%-63s -- ", cmd_str); + free(cmd_str); + } + + char cmd_opts[64]; + sprintf(cmd_opts, "Rank %d: ncmpidiff", rank); + + char *ptr = strrchr(out_path, '.'); + if (ptr != NULL) *ptr = '\0'; + + MPI_Info_create(&info); + + num_ina = (opt.ina) ? 2 : 1; + num_drv = (opt.drv) ? 2 : 1; + num_ind = (opt.ind) ? 2 : 1; + num_chk = (opt.chk) ? 2 : 1; + num_bb = (opt.bb) ? 2 : 1; + num_mod = (opt.mod) ? 
2 : 1; + + for (i=0; i 0) + sprintf(ext, "nc%d", opt.formats[i]); + else /* for tests not testing CDF versions */ + strcpy(ext, "nc"); + + for (a=0; a 0) + printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", + sum_size); + if (malloc_size > 0) ncmpi_inq_malloc_list(); + } + +err_out: + if (in_path != NULL) free(in_path); + if (out_path != NULL) free(out_path); + + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + +#ifdef PROFILING + MPI_Allreduce(MPI_IN_PLACE, itiming, 256, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + if (rank == 0) { + for (i=0; i 0); +} + +/*----< pnc_fmt_string() >---------------------------------------------------*/ +char* pnc_fmt_string(int format) +{ + switch(format) { + case NC_FORMAT_CLASSIC: return "NC_FORMAT_CLASSIC"; + case NC_FORMAT_64BIT_OFFSET: return "NC_FORMAT_64BIT_OFFSET"; + case NC_FORMAT_NETCDF4: return "NC_FORMAT_NETCDF4"; + case NC_FORMAT_NETCDF4_CLASSIC: return "NC_FORMAT_NETCDF4_CLASSIC"; + case NC_FORMAT_64BIT_DATA: return "NC_FORMAT_64BIT_DATA"; + default: return "UNKNOWN"; + } +} diff --git a/test/common/testutils.h b/test/common/testutils.h index ef3d2593f8..0ab416d265 100644 --- a/test/common/testutils.h +++ b/test/common/testutils.h @@ -15,15 +15,16 @@ #include #include #include +#include #define MODE_COLL 1 #define MODE_INDEP 0 #define CHECK_ERR { \ if (err != NC_NOERR) { \ - nerrs++; \ printf("Error at line %d in %s: (%s)\n", \ __LINE__,__FILE__,ncmpi_strerrno(err)); \ + assert(0); \ } \ } @@ -34,7 +35,7 @@ __LINE__,__FILE__,ncmpi_strerrno(err)); \ } \ MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ - if (nerrs > 0) goto fn_exit; \ + if (nerrs > 0) goto err_out; \ } #define CHECK_ERROUT { \ @@ -51,7 +52,7 @@ nerrs++; \ printf("Error at line %d in %s: (%s)\n", \ __LINE__,__FILE__,ncmpi_strerrno(err)); \ - goto fn_exit; \ 
+ assert(0); \ } \ } @@ -70,22 +71,25 @@ __LINE__,__FILE__,ncmpi_strerrno(exp), ncmpi_strerrno(err)); \ } \ MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ - if (nerrs > 0) goto fn_exit; \ + if (nerrs > 0) goto err_out; \ } +#define CHECK_NERRS_ALL if (nerrs != 0) assert(0); +/* #define CHECK_NERRS_ALL { \ MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ - if (nerrs > 0) goto fn_exit; \ + if (nerrs > 0) assert(0) err_out; \ } +*/ int inq_env_hint(char *hint_key, char **hint_value); #ifdef PNETCDF_DEBUG -#define PASS_STR "\x1b[32mpass\x1b[0m\n" +#define PASS_STR "\x1b[32mpass\x1b[0m (%4.1fs)\n" #define SKIP_STR "\x1b[32mskip\x1b[0m\n" #define FAIL_STR "\x1b[31mfail\x1b[0m with %d mismatches\n" #else -#define PASS_STR "pass\n" +#define PASS_STR "pass (%4.1fs)\n" #define SKIP_STR "skip\n" #define FAIL_STR "fail with %d mismatches\n" #endif @@ -98,6 +102,14 @@ int inq_env_hint(char *hint_key, char **hint_value); printf("MPI Error at file %s line %d (%s)\n",__FILE__,__LINE__,err_string); \ } +#ifdef ENABLE_NETCDF4 +extern int nc_formats[5]; +#else +extern int nc_formats[3]; +#endif + +extern char* pnc_fmt_string(int format); + extern char* nc_err_code_name(int err); #if MPI_VERSION < 3 @@ -114,6 +126,35 @@ extern char *strdup(const char *s); extern int strcasecmp(const char *s1, const char *s2); #endif +extern char* remove_file_system_type_prefix(const char *filename); +extern +int is_relax_coord_bound(void); + +extern +void tst_usage(char *argv0); + +typedef struct { + char *in_path; /* input file path for read tests */ + int num_fmts; /* number of file formats: CDF 1/2/3/4/5 */ + int *formats; /* [num_fmts] max are {NC_FORMAT_CLASSIC, + NC_FORMAT_64BIT_OFFSET, + NC_FORMAT_NETCDF4, + NC_FORMAT_NETCDF4_CLASSIC, + NC_FORMAT_64BIT_DATA}; */ + int ina; /* add test of intra-node aggregation */ + int drv; /* add test of PNCIO driver in addition to MPI-IO */ + int ind; /* add test of hint romio_no_indep_rw */ + int chk; 
/* add test of hint nc_data_move_chunk_size (100 bytes when set to 1) */ + int bb; /* add test of burst-buffering feature */ + int mod; /* add test of independent data mode */ + int hdr_diff; /* run ncmpidiff for file header only */ + int var_diff; /* run ncmpidiff for variables (disabled automatically when hdr_diff == 0) */ +} loop_opts; + +extern +int tst_main(int argc, char **argv, char *msg, loop_opts opt, + int (*tst_body)(const char*, const char*, int, int, MPI_Info)); + #endif diff --git a/test/common/testutilsf.F90 b/test/common/testutilsf.F90 index ca08e381cf..545ea88a90 100644 --- a/test/common/testutilsf.F90 +++ b/test/common/testutilsf.F90 @@ -6,19 +6,28 @@ ! ! $Id$ + subroutine fusage(cmd) + implicit none + character(len=*) cmd + + print*,'Usage: ',trim(cmd),' [OPTIONS]' + print*,' [-h] Print help' + print*,' [-q] quiet mode' + print*,' [-k] Keep output files (default: no)' + print*,' [-i in_path]: input file path (default: NULL)' + print*,' [-o out_path]: output netCDF file name (default: %s.nc)' + end subroutine fusage + ! This function gets the executable name and output file name from the ! command line. - integer function get_args(cmd, filename) -#ifdef NAGFOR - USE F90_UNIX_ENV, only : iargc, getarg - implicit none -#else + integer function get_args(cmd, out_path, in_path, keep_files) implicit none - integer iargc -#endif - integer argc, i - character(len=*) cmd, filename - character(len=256) full_cmd + character(len=*) cmd, out_path, in_path + character(len=256) :: full_cmd, arg + logical :: keep_files, skip_next + integer :: i, j, n_args + + keep_files = .false. get_args = 1 call getarg(0, full_cmd) @@ -31,20 +40,46 @@ integer function get_args(cmd, filename) cmd(:) = full_cmd(i+1:) endif - argc = IARGC() - if (argc .GT. 1) then - print*,'Usage: ',trim(cmd),' [filename]' - get_args = 0 - return - endif - if (argc .EQ. 1) call getarg(1, filename) + n_args = command_argument_count() + + skip_next = .false. 
+ do j = 1, n_args + if (skip_next) then + skip_next = .false. + cycle + end if + + call get_command_argument(j, arg) + arg = trim(arg) ! Remove trailing spaces + + if (arg == "-k") then + keep_files = .true. + else if (arg == "-i") then + if (j < n_args) then + call get_command_argument(j+1, arg) + in_path = trim(arg) + skip_next = .true. + end if + else if (arg == "-o") then + if (j < n_args) then + call get_command_argument(j+1, arg) + out_path = trim(arg) + skip_next = .true. + end if + else if (arg == "-h") then + call fusage(cmd) + return + end if + end do + end function get_args ! This function prints the pass/fail message on screen - subroutine pass_fail(nerrs, msg) + subroutine pass_fail(nerrs, msg, timing) implicit none integer nerrs character(len=*) msg + double precision timing ! local variables CHARACTER ESC @@ -52,18 +87,18 @@ subroutine pass_fail(nerrs, msg) #ifdef PNETCDF_DEBUG CHARACTER (LEN=20) PASS_STR, FAIL_STR - PARAMETER (PASS_STR='------ '//ESC//'[32mpass'//ESC//'[0m') - PARAMETER (FAIL_STR='------ '//ESC//'[31mfail'//ESC//'[0m') + PARAMETER (PASS_STR='-- '//ESC//'[32mpass'//ESC//'[0m (') + PARAMETER (FAIL_STR='-- '//ESC//'[31mfail'//ESC//'[0m') #else CHARACTER (LEN=11) PASS_STR, FAIL_STR - PARAMETER (PASS_STR='------ pass') - PARAMETER (FAIL_STR='------ fail') + PARAMETER (PASS_STR='-- pass (') + PARAMETER (FAIL_STR='-- fail') #endif if (nerrs .EQ. 0) then - write(*,"(A67,A)") msg, PASS_STR + write(*,"(A64,A,F4.1,A)") msg, trim(PASS_STR), timing, 's)' else - write(*,"(A67,A)") msg, FAIL_STR + write(*,"(A64,A)") msg, trim(FAIL_STR) endif end subroutine pass_fail @@ -76,3 +111,25 @@ subroutine get_env(hint_str, value) #endif end subroutine get_env + LOGICAL FUNCTION relax_coord_bound_f() + character(len=256) :: env_str, env_val + integer :: ierr + +#ifdef RELAX_COORD_BOUND + relax_coord_bound_f = .TRUE. +#else + relax_coord_bound_f = .FALSE. 
+#endif + env_str = "PNETCDF_RELAX_COORD_BOUND" + call get_environment_variable(env_str, value=env_val, status=ierr) + + if (ierr == 0) THEN + ! Environment variable is set + if (env_val(1:1) == '1') then + relax_coord_bound_f = .TRUE. + else + relax_coord_bound_f = .FALSE. + endif + endif + END FUNCTION relax_coord_bound_f + diff --git a/test/fandc/Makefile.am b/test/fandc/Makefile.am index dc3f81432a..29ad063736 100644 --- a/test/fandc/Makefile.am +++ b/test/fandc/Makefile.am @@ -47,11 +47,11 @@ if DECL_MPI_OFFSET AM_FCFLAGS += $(FC_DEFINE)HAVE_DECL_MPI_OFFSET endif -TESTPROGRAMS = pnctest \ - csnap +check_PROGRAMS = pnctest \ + csnap if HAS_FORTRAN - TESTPROGRAMS += pnf_test pnctestf fixedform + check_PROGRAMS += pnf_test pnctestf fixedform pnf_test_SOURCES = pnf_test.f pnf_test_FFLAGS = $(FFIXEDFORMFLAG) $(AM_FFLAGS) pnctestf_SOURCES = pnctestf.f @@ -59,17 +59,15 @@ if HAS_FORTRAN fixedform_SOURCES = fixedform.f90 fixedform_FCFLAGS = $(FFIXEDFORMFLAG) $(AM_FCFLAGS) $(AM_FFLAGS) if HAVE_F77_SUPPORT_FREEFORM - TESTPROGRAMS += freeform + check_PROGRAMS += freeform freeform_SOURCES = freeform.f freeform_FFLAGS = $(FFREEFORMFLAG) $(AM_FFLAGS) endif endif -check_PROGRAMS = $(TESTPROGRAMS) - # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead -# AM_TESTS_ENVIRONMENT = TESTPROGRAMS="$(TESTPROGRAMS)" ; export TESTPROGRAMS; +# AM_TESTS_ENVIRONMENT = check_PROGRAMS="$(check_PROGRAMS)" ; export check_PROGRAMS; # AM_TESTS_ENVIRONMENT += TESTSEQRUN="$(TESTSEQRUN)" ; export TESTSEQRUN; # AM_TESTS_ENVIRONMENT += TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)" ; export TESTOUTDIR; TESTS_ENVIRONMENT = export SED="$(SED)"; @@ -77,13 +75,24 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export 
PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + # programs in this folder are just for testing compile error/warning messages, # not for running -# TESTS = $(TESTPROGRAMS) +# TESTS = $(check_PROGRAMS) EXTRA_DIST = README diff --git a/test/fandc/csnap.c b/test/fandc/csnap.c index 2443086e5e..6528503b7b 100644 --- a/test/fandc/csnap.c +++ b/test/fandc/csnap.c @@ -64,7 +64,7 @@ int pe_coords[3]; /* Cartesian PE coords */ /*** function prototypes ***/ -void find_locnx(MPI_Offset nx, int mype, int totpes, MPI_Offset *locnx, MPI_Offset *xbegin); +void find_locnx(MPI_Offset nx, int rank, int nprocs, MPI_Offset *locnx, MPI_Offset *xbegin); void write_file(char *filename, double *t); void read_file(char *filename, double *t); void get_fields(double *tt, double *smf); @@ -79,8 +79,12 @@ int main(int argc, char *argv[]) { double rates_l[4], rates_g[4]; int i, rank; char filename[256]; + double timing; + + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); - MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&totpes); MPI_Comm_rank(MPI_COMM_WORLD,&rank); @@ -96,11 +100,12 @@ int main(int argc, char *argv[]) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); sprintf(cmd_str, "*** TESTING C %s for 3D array write/read ", argv[0]); - printf("%-66s ------ ", cmd_str); fflush(stdout); + printf("%-64s -- ", cmd_str); fflush(stdout); free(cmd_str); } + if (filename[0] == '\0') { - printf(PASS_STR); + printf(FAIL_STR, 1); fprintf(stderr,"Error: invalid filename, Exiting ...\n"); MPI_Finalize(); return 1; @@ -195,10 +200,12 @@ int main(int argc, char 
*argv[]) { sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); @@ -390,14 +397,14 @@ void read_file(char *filename, double *t) { } -void find_locnx(MPI_Offset nx, int mype, int totpes, MPI_Offset *locnx, MPI_Offset *xbegin) { +void find_locnx(MPI_Offset nx, int rank, int nprocs, MPI_Offset *locnx, MPI_Offset *xbegin) { MPI_Offset xremain; - *locnx = nx / totpes; - xremain = nx - totpes*(*locnx); - if (mype < xremain) (*locnx)++; - *xbegin = mype*(nx/totpes) + xremain; - if (mype < xremain) *xbegin += mype - xremain; + *locnx = nx / nprocs; + xremain = nx - nprocs*(*locnx); + if (rank < xremain) (*locnx)++; + *xbegin = rank*(nx/nprocs) + xremain; + if (rank < xremain) *xbegin += rank - xremain; } diff --git a/test/fandc/pnctest.c b/test/fandc/pnctest.c index b85bd91703..dfafdc0fbe 100644 --- a/test/fandc/pnctest.c +++ b/test/fandc/pnctest.c @@ -21,8 +21,12 @@ int main (int argc, char *argv[]) zero is specified */ MPI_Offset TOTSIZ_3D[3] = { 10, 20, 30 }; MPI_Comm comm_cart; + double timing; + + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); - MPI_Init (&argc, &argv); MPI_Comm_size (MPI_COMM_WORLD, &totpes); MPI_Comm_size (MPI_COMM_WORLD, &rank); @@ -57,10 +61,12 @@ int main (int argc, char *argv[]) sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/header/Makefile.am b/test/header/Makefile.am index 8080f6885a..2535a5dcea 100644 --- a/test/header/Makefile.am +++ b/test/header/Makefile.am @@ -23,13 +23,11 
@@ if DECL_MPI_OFFSET # AM_FCFLAGS += $(FC_DEFINE)HAVE_DECL_MPI_OFFSET endif -TESTPROGRAMS = header_consistency - -check_PROGRAMS = $(TESTPROGRAMS) +check_PROGRAMS = header_consistency # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead -# AM_TESTS_ENVIRONMENT = export TESTPROGRAMS="$(TESTPROGRAMS)"; +# AM_TESTS_ENVIRONMENT = export check_PROGRAMS="$(check_PROGRAMS)"; # AM_TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; # AM_TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT = export SED="$(SED)"; @@ -37,20 +35,31 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; + +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif # consistency check should be run on more than one process -# TESTS = seq_runs.sh +TESTS = $(check_PROGRAMS) +TEST_EXTENSIONS = .sh +LOG_COMPILER = $(srcdir)/seq_runs.sh +SH_LOG_COMPILER = EXTRA_DIST = seq_runs.sh parallel_run.sh -NC_FILES = $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.nc) \ - $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) - -CLEANFILES = $(NC_FILES) core core.* *.gcda *.gcno *.gcov gmon.out +CLEANFILES = $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ + core core.* *.gcda *.gcno *.gcov gmon.out ../common/libtestutils.la: set -e; cd ../common && $(MAKE) 
$(MFLAGS) tests diff --git a/test/header/header_consistency.c b/test/header/header_consistency.c index 62b05923b5..272aac6a14 100644 --- a/test/header/header_consistency.c +++ b/test/header/header_consistency.c @@ -13,7 +13,10 @@ #include #include #include /* basename() */ +#include + #include + #include #include @@ -44,11 +47,9 @@ /*----< test_open_mode() >----------------------------------------------------*/ static -int test_open_mode(char *filename, int safe_mode) +int test_open_mode(const char *filename, MPI_Comm comm, int safe_mode, MPI_Info info) { int err, rank, ncid, cmode, omode, nerrs=0; - MPI_Info info=MPI_INFO_NULL; - MPI_Comm comm=MPI_COMM_WORLD; MPI_Comm_rank(comm, &rank); @@ -84,11 +85,9 @@ int test_open_mode(char *filename, int safe_mode) /*----< test_dim() >----------------------------------------------------------*/ static -int test_dim(char *filename, int safe_mode) +int test_dim(const char *filename, MPI_Comm comm, int safe_mode, MPI_Info info) { int err, rank, ncid, cmode, dimid1, dimid2, dimid3, nerrs=0; - MPI_Info info=MPI_INFO_NULL; - MPI_Comm comm=MPI_COMM_WORLD; MPI_Comm_rank(comm, &rank); cmode = NC_CLOBBER|NC_64BIT_OFFSET; @@ -129,19 +128,18 @@ int test_dim(char *filename, int safe_mode) CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR + return nerrs; } /*----< test_attr() >---------------------------------------------------------*/ static -int test_attr(char *filename, int safe_mode) +int test_attr(const char *filename, MPI_Comm comm, int safe_mode, MPI_Info info) { int err, rank, ncid, cmode, nerrs=0; char gattr[128]; int int_attr; float flt_attr; - MPI_Info info=MPI_INFO_NULL; - MPI_Comm comm=MPI_COMM_WORLD; MPI_Comm_rank(comm, &rank); cmode = NC_CLOBBER|NC_64BIT_OFFSET; @@ -197,14 +195,12 @@ int test_attr(char *filename, int safe_mode) /*----< test_var() >----------------------------------------------------------*/ static -int test_var(char *filename, int safe_mode) +int test_var(const char *filename, MPI_Comm comm, int safe_mode, 
MPI_Info info) { int err, rank, ncid, cmode, nerrs=0; int dimid[3], varid1, int_attr; float flt_attr; char name[128], var_attr[128]; - MPI_Info info=MPI_INFO_NULL; - MPI_Comm comm=MPI_COMM_WORLD; MPI_Comm_rank(comm, &rank); cmode = NC_CLOBBER|NC_64BIT_OFFSET; @@ -343,35 +339,41 @@ int test_var(char *filename, int safe_mode) return nerrs; } -/*----< main() >--------------------------------------------------------------*/ -int main(int argc, char **argv) +/*----< test_io() >----------------------------------------------------------*/ +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, /* ignored */ + int coll_io, /* ignored */ + MPI_Info info) { - char *filename="testfile.nc", *mode[2] = {"0", "1"}; + char *mode[2] = {"0", "1"}; int i, rank, nprocs, verbose, nerrs=0; + MPI_Comm comm=MPI_COMM_NULL; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (nprocs < 2) { - if (!rank) printf("This program is for running 2 or more processes. Exiting ...\n"); - MPI_Finalize(); - return 1; - } + if (nprocs == 1) return 0; - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) filename = argv[1]; + assert(in_path == NULL); + + /* This program is designed to run on 2 MPI processes */ + if (nprocs > 2) { + /* Make MPI calls to create a new communicator. 
*/ + int new_ranks[2]={0,1}; + MPI_Group origin_group, group; - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for header consistency", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); + MPI_Comm_group(MPI_COMM_WORLD, &origin_group); + MPI_Group_incl(origin_group, 2, new_ranks, &group); + MPI_Comm_create(MPI_COMM_WORLD, group, &comm); + MPI_Group_free(&group); + MPI_Group_free(&origin_group); } + else + comm = MPI_COMM_WORLD; + + if (rank >= 2) goto err_out; verbose = 1; for (i=verbose; i>=0; i--) { @@ -381,32 +383,52 @@ int main(int argc, char **argv) * PNETCDF_SAFE_MODE to 0. */ setenv("PNETCDF_SAFE_MODE", mode[i], 1); - nerrs += test_open_mode(filename, i); - nerrs += test_dim(filename, i); + nerrs = test_open_mode(out_path, comm, i, info); + if (nerrs > 0) goto err_out; - nerrs += test_attr(filename, i); + nerrs = test_dim(out_path, comm, i, info); + if (nerrs > 0) goto err_out; - nerrs += test_var(filename, i); - } + nerrs = test_attr(out_path, comm, i, info); + if (nerrs > 0) goto err_out; - MPI_Offset malloc_size, sum_size; - int err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); + nerrs = test_var(out_path, comm, i, info); + if (nerrs > 0) goto err_out; } - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +err_out: + if (comm != MPI_COMM_WORLD && comm != MPI_COMM_NULL) + MPI_Comm_free(&comm); + + return nerrs; +} + +int main(int argc, char **argv) { + + /* This test program does not support NetCDF4 option */ + int err, nprocs, formats[] = {0}; + + loop_opts opt; + + MPI_Init(&argc, 
&argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = (nprocs > 1); /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "header consistency", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/header/parallel_run.sh b/test/header/parallel_run.sh index a4d0770fa1..da174c4a67 100755 --- a/test/header/parallel_run.sh +++ b/test/header/parallel_run.sh @@ -1,24 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,48 +33,24 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + for i in ${check_PROGRAMS} ; do - for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - # echo "" + exe_name=`basename $i` + + for j in ${safe_modes} ; do - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line 
${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc - # burst buffering does not support nonblocking requests in define mode - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc - fi + done # safe_modes - if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc4 4 - # Validator does not support nc4 - fi - done - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.bb.nc -done +done # check_PROGRAMS diff --git a/test/header/seq_runs.sh b/test/header/seq_runs.sh index 475c8da8ac..205d1fb396 100755 --- a/test/header/seq_runs.sh +++ b/test/header/seq_runs.sh @@ -1,44 +1,44 @@ -#!/bin/sh +#!/bin/bash # -# Copyright (C) 2003, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -outfile=`basename $1` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" fi +exe_name=`basename $1` + # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -# header consistency tests are designed to run on more than one MPI process for j in ${safe_modes} ; do - export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc - - - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc - unset PNETCDF_HINTS - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc - - # echo "--- ncmpidiff $outfile.nc $outfile.bb.nc ---" - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$outfile.nc ${TESTOUTDIR}/$outfile.bb.nc - fi -done + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc + +done # safe_modes + diff --git a/test/largefile/Makefile.am b/test/largefile/Makefile.am index d74134b761..0ee45cc7b6 100644 --- a/test/largefile/Makefile.am +++ b/test/largefile/Makefile.am @@ -24,46 +24,42 @@ if IS_BIGENDIAN # AM_FCFLAGS += $(FC_DEFINE)WORDS_BIGENDIAN endif -TESTPROGRAMS = large_files \ - large_var \ - 
large_attr \ - large_dims_vars_attrs \ - high_dim_var \ - tst_cdf5_begin \ - tst_hash_large_ndims \ - tst_hash_large_nvars \ - tst_hash_large_ngattrs \ - large_coalesce \ - large_header \ - large_reqs +check_PROGRAMS = large_files \ + large_var \ + large_attr \ + large_dims_vars_attrs \ + high_dim_var \ + tst_cdf5_begin \ + tst_hash_large_ndims \ + tst_hash_large_nvars \ + tst_hash_large_ngattrs \ + large_coalesce \ + large_header \ + large_reqs if HAS_FORTRAN - TESTPROGRAMS += bigrecords + check_PROGRAMS += bigrecords bigrecords_SOURCES = bigrecords.f AM_FFLAGS = -I$(top_builddir)/src/binding/f77 $(FFIXEDFORMFLAG) if HAVE_MPI_MOD - TESTPROGRAMS += tst_flarge + check_PROGRAMS += tst_flarge tst_flarge_SOURCES = tst_flarge.f90 AM_FCFLAGS = $(FC_MODINC)$(top_builddir)/src/binding/f90 \ $(FC_MODINC)../common $(FFREEFORMFLAG) endif endif -EXTRA_DIST = wrap_runs.sh parallel_run.sh +EXTRA_DIST = seq_runs.sh parallel_run.sh -NC_FILES = $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.nc) \ - $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) - -CLEANFILES = $(NC_FILES) core core.* *.gcda *.gcno *.gcov gmon.out +CLEANFILES = $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ + *.nc core core.* *.gcda *.gcno *.gcov gmon.out ../common/libtestutils.la: set -e; cd ../common && $(MAKE) $(MFLAGS) tests -check_PROGRAMS = $(TESTPROGRAMS) - # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead -# AM_TESTS_ENVIRONMENT = export TESTPROGRAMS="$(TESTPROGRAMS)"; +# AM_TESTS_ENVIRONMENT = export check_PROGRAMS="$(check_PROGRAMS)"; # AM_TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; # AM_TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT = export SED="$(SED)"; @@ -71,15 +67,27 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; 
-TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export ENABLE_NETCDF4="$(ENABLE_NETCDF4)"; + +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_NETCDF4 + TESTS_ENVIRONMENT += export ENABLE_NETCDF4=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif TESTS = $(check_PROGRAMS) TEST_EXTENSIONS = .sh -LOG_COMPILER = $(srcdir)/wrap_runs.sh +LOG_COMPILER = $(srcdir)/seq_runs.sh SH_LOG_COMPILER = # Some of these tests are designed to run on one processes, @@ -96,7 +104,7 @@ ptests: ptest4 ptest2 ptest6 ptest8 ptest10: # build check targets but not invoke -tests-local: all $(TESTPROGRAMS) +tests-local: all $(check_PROGRAMS) .PHONY: ptest ptests ptest2 ptest4 ptest6 ptest8 ptest10 diff --git a/test/largefile/bigrecords.f b/test/largefile/bigrecords.f index 51715e8ed8..08114b6d9d 100644 --- a/test/largefile/bigrecords.f +++ b/test/largefile/bigrecords.f @@ -88,29 +88,36 @@ program main + 377.5, 367.59, 360.06, 353.85999, 348.66, 342.5, 336, 328.5, 320, + 310, 300, 290, 280, 270, 260, 250, 240, 230, 220, 210, 199.10001/ - character(LEN=256) filename, cmd, msg - + character(LEN=256) out_path, in_path, cmd, msg + logical keep_files + double precision timing ! attribute vectors ! enter define mode ! iret = nf_create('pressure_19010101_000000.nc', OR(NF_CLOBBER,NF_64BIT_OFFSET), ncid) call MPI_INIT(ierr) + + timing = MPI_Wtime() + call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr) call MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr) if (myid .EQ. 
0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, + ierr) call MPI_Bcast(cmd, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, + ierr) - iret = nfmpi_create( MPI_COMM_WORLD, filename, + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, + + ierr) + + iret = nfmpi_create( MPI_COMM_WORLD, out_path, + IOR(NF_CLOBBER,NF_64BIT_DATA), + MPI_INFO_NULL, ncid) @@ -232,7 +239,7 @@ program main ! todo: insert code to re-open dataset, read time variable all at once ! iret = nfmpi_open ( MPI_COMM_SELF, - + filename, + + out_path, + IOR(NF_CLOBBER,NF_64BIT_DATA), + MPI_INFO_NULL, + ncid) @@ -259,10 +266,21 @@ program main ! write(6,*) "Error: time array was ", time ! endif - msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))//' for NF_64BIT_DATA' - if (myid .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) + + if (myid .eq. 0) then + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + + msg='*** TESTING F77 '//cmd(1:XTRIM(cmd))//' for NF_64BIT_DATA' + call pass_fail(0, msg, timing) + endif - 999 call MPI_FINALIZE(ierr) + call MPI_FINALIZE(ierr) end ! program main subroutine writerecs(cmd,ncid,time_id) @@ -336,7 +354,7 @@ subroutine check_err(cmd, msg, iret) print *, msg, nfmpi_strerror(iret) msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))// + ' for NF_64BIT_DATA' - call pass_fail(1, msg) + call pass_fail(1, msg, timing) stop 2 endif end ! 
subroutine check_err diff --git a/test/largefile/high_dim_var.c b/test/largefile/high_dim_var.c index 482e9d2688..81671b5439 100644 --- a/test/largefile/high_dim_var.c +++ b/test/largefile/high_dim_var.c @@ -24,39 +24,29 @@ #define DIMLEN 3 #define NRECS 4 -int main(int argc, char** argv) { - char filename[256], name[32]; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + char name[32]; size_t nelms; short *buffer; - int i, j, cmode, rank, nprocs, err, nerrs=0; + int i, j, rank, nprocs, err, nerrs=0; int ncid, fvarid[NVARS], rvarid[NVARS], dimids[NDIMS], rdimids[NDIMS]; MPI_Offset start[NDIMS], count[NDIMS], stride[NDIMS]; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - memset(filename, 0, 256); - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for vars APIs on high-dim variables ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - cmode = NC_CLOBBER; - cmode |= NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, - &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define dimensions */ err = ncmpi_def_dim(ncid, "rdim", NC_UNLIMITED, &rdimids[0]); CHECK_ERR @@ -76,7 +66,6 @@ int main(int argc, char** argv) { err = ncmpi_set_fill(ncid, NC_FILL, NULL); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR - if (err != NC_NOERR) goto fn_exit; #ifdef STRONGER_CONSISTENCY ncmpi_sync(ncid); @@ -89,16 
+78,17 @@ int main(int argc, char** argv) { buffer = (short*) malloc(sizeof(short) * nelms); if (buffer == NULL) { printf("Error %s at line %d: fail to allocate buffer of size %zu\n", - argv[0], __LINE__, nelms * sizeof(int)); - goto fn_exit; + basename(__FILE__), __LINE__, nelms * sizeof(short)); + nerrs++; + goto err_out; } for (i=0; i 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); -#endif - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +err_out: + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "vars APIs on high-dim variables", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/largefile/large_attr.c b/test/largefile/large_attr.c index c032c6ba5c..c8e4a0e72f 100644 --- a/test/largefile/large_attr.c +++ b/test/largefile/large_attr.c @@ -28,67 +28,67 @@ #include #include -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256], *name, *buf; + char *name, *buf; size_t i; - int rank, nprocs, err, nerrs=0; - int ncid, cmode, varid, dimid; + int err, nerrs=0, ncid, 
varid, dimid; MPI_Offset nelems, inq_nelems; - MPI_Info info=MPI_INFO_NULL; -#ifdef PNC_MALLOC_TRACE - int verbose=0; -#endif + int rank, nprocs, color; + MPI_Comm comm=MPI_COMM_NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for one large ATTR", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); + color = 1; + + if (nprocs > 2) { + /* run on 2 ranks only, as this test allocates memory > 4GB per rank */ + /* split MPI_COMM_WORLD based on 'color' and use the same rank order */ + color = (rank < 2) ? 
1 : 0; + MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm); } + else + comm = MPI_COMM_WORLD; + + if (!color) goto err_out; nelems = (MPI_Offset)NC_MAX_INT + 17; buf = (char*) malloc(nelems); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file and put a large global attribute -------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* put a large (> 2GiB) global attribute */ for (i=0; i 0) goto err_out; /* open the file and read back the large global attribute ---------------*/ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); + err = ncmpi_open(comm, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR - if (err != NC_NOERR) goto err_out; err = ncmpi_inq_attlen(ncid, NC_GLOBAL, "large_attr", &inq_nelems); + CHECK_ERR if (inq_nelems != nelems) { printf("Error at %s line %d: expecting attr nelems "OFFFMT" but got "OFFFMT"\n", __FILE__,__LINE__,nelems,inq_nelems); nerrs++; + goto err_out; } for (i=0; i 0) goto err_out; /* create a new file and put a large local attribute -------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "time", NC_UNLIMITED, &dimid); @@ -122,15 +121,14 @@ int main(int argc, char** argv) /* put a large (> 2GiB) global attribute */ for (i=0; i 0) goto err_out; /* open the file and read back the large global attribute ---------------*/ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); + err = ncmpi_open(comm, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR if (err != NC_NOERR) goto err_out; @@ -142,6 +140,7 @@ int main(int argc, char** argv) printf("Error at %s line %d: expecting attr len "OFFFMT" but got "OFFFMT"\n", 
__FILE__,__LINE__,nelems,inq_nelems); nerrs++; + goto err_out; } for (i=0; i 0) goto err_out; /* create a new file and put 2 global attributes, total size > 2 GiB ----*/ nelems /= 2; - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* put two global attributes (total size > 2GiB) */ for (i=0; i 0) goto err_out; /* open the file and read back the large global attributes --------------*/ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); + err = ncmpi_open(comm, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR if (err != NC_NOERR) goto err_out; @@ -193,6 +189,7 @@ int main(int argc, char** argv) printf("Error at %s line %d: expecting attr %s nelems "OFFFMT" but got "OFFFMT"\n", __FILE__,__LINE__,name, nelems,inq_nelems); nerrs++; + goto err_out; } for (i=0; i 0) goto err_out; /* create a new file and put 2 local attributes, total size > 2 GiB -----*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "time", NC_UNLIMITED, &dimid); @@ -250,20 +246,19 @@ int main(int argc, char** argv) /* put two local attributes (total size > 2GiB) */ name = "large_attr_0"; err = ncmpi_put_att_text(ncid, varid, name, nelems, buf); - if (!(cmode & NC_64BIT_DATA)) EXP_ERR(NC_EINVAL) + if (format != NC_FORMAT_64BIT_DATA) EXP_ERR(NC_EINVAL) else CHECK_ERR name = "large_attr_1"; err = ncmpi_put_att_text(ncid, varid, name, nelems, buf); - if (!(cmode & NC_64BIT_DATA)) EXP_ERR(NC_EINVAL) + if (format != NC_FORMAT_64BIT_DATA) EXP_ERR(NC_EINVAL) else CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - if (nerrs > 0) goto err_out; /* open the file and read back the two local attributes -----------------*/ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, 
&ncid); + err = ncmpi_open(comm, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR if (err != NC_NOERR) goto err_out; @@ -276,6 +271,7 @@ int main(int argc, char** argv) printf("Error at %s line %d: expecting attr %s len "OFFFMT" but got "OFFFMT"\n", __FILE__,__LINE__,name,nelems,inq_nelems); nerrs++; + goto err_out; } for (i=0; i 0) goto err_out; free(buf); -#ifdef PNC_MALLOC_TRACE - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); +err_out: + if (comm != MPI_COMM_WORLD && comm != MPI_COMM_NULL) + MPI_Comm_free(&comm); - if (verbose) { - err = ncmpi_inq_malloc_max_size(&malloc_size); - printf("\n%d: PnetCDF internal memory footprint high water mark %.2f MB\n", - rank, (float)malloc_size/1048576); - } -#endif + return nerrs; +} -err_out: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "one large ATTR", opt, test_io); MPI_Finalize(); - return (nerrs 
> 0); -} + return err; +} diff --git a/test/largefile/large_coalesce.c b/test/largefile/large_coalesce.c index 6b4ece55dc..2678efb079 100644 --- a/test/largefile/large_coalesce.c +++ b/test/largefile/large_coalesce.c @@ -26,36 +26,33 @@ #define TWO_G 2147483648LL #define ONE_G 1073741824LL -int main(int argc, char** argv) +static +int test_io_nc5(const char *out_path, + MPI_Info global_info) { - char filename[256]; unsigned char *buf; - int rank, nprocs, err, nerrs=0; - int ncid, cmode, varid, dimid[2], req[3], st[3]; + int rank, nprocs, color, err, nerrs=0; + int ncid, varid, dimid[2], req[3], st[3]; MPI_Offset start[2], count[2]; MPI_Info info; size_t i; + MPI_Comm comm=MPI_COMM_NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for skip filetype buftype coalesce ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); + color = 1; + + if (nprocs > 2) { + /* run on 2 ranks only, as this test allocates memory > 4GB per rank */ + /* split MPI_COMM_WORLD based on 'color' and use the same rank order */ + color = (rank < 2) ? 
1 : 0; + MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm); } + else + comm = MPI_COMM_WORLD; + + if (!color) goto err_out; buf = (unsigned char*) calloc(TWO_G+1024,1); if (buf == NULL) { @@ -64,90 +61,16 @@ int main(int argc, char** argv) return 1; } - MPI_Info_create(&info); + MPI_Info_dup(global_info, &info); MPI_Info_set(info, "romio_cb_write", "enable"); MPI_Info_set(info, "romio_ds_read", "disable"); /* run slow without it */ - /* silence iternal debug messages */ + /* silence internal debug messages */ setenv("PNETCDF_SAFE_MODE", "0", 1); -#ifdef ENABLE_NETCDF4 - /* Test for NetCDF 4 first as ncvalidator checks only read classic files */ - - /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_NETCDF4; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); - CHECK_ERR - - /* define dimensions */ - err = ncmpi_def_dim(ncid, "NPROCS", nprocs, &dimid[0]); - CHECK_ERR - - err = ncmpi_def_dim(ncid, "X", TWO_G+1024, &dimid[1]); - CHECK_ERR - - /* define a big 1D variable of ubyte type */ - err = ncmpi_def_var(ncid, "big_var", NC_UBYTE, 2, dimid, &varid); - CHECK_ERR - - /* do not forget to exit define mode */ - err = ncmpi_enddef(ncid); - CHECK_ERR - - /* now we are in data mode */ - for (i=0; i<20; i++) buf[ONE_G-10+i] = 'a'+i; - for (i=0; i<20; i++) buf[TWO_G-10+i] = 'A'+i; - - start[0] = rank; - count[0] = 1; - - start[1] = 0; - count[1] = 10; - err = ncmpi_put_vara_uchar_all(ncid, varid, start, count, buf); - CHECK_ERR - - /* 2nd request is not contiguous from the first */ - start[1] = 1024; - count[1] = ONE_G-1024; - err = ncmpi_put_vara_uchar_all(ncid, varid, start, count, buf+1024); - CHECK_ERR - - /* make file access and write buffer of 3rd request contiguous from the 2nd - * request to check whether the internal fileview and buftype coalescing - * are skipped */ - start[1] = ONE_G; - count[1] = ONE_G+1024; - err = ncmpi_put_vara_uchar_all(ncid, varid, start, count, buf+ONE_G); - CHECK_ERR - - 
start[1] = 0; - count[1] = 10; - err = ncmpi_get_vara_uchar_all(ncid, varid, start, count, buf); - CHECK_ERR - - start[1] = 1024; - count[1] = ONE_G-1024; - err = ncmpi_get_vara_uchar_all(ncid, varid, start, count, buf+1024); - CHECK_ERR - - start[1] = ONE_G; - count[1] = ONE_G+1024; - err = ncmpi_get_vara_uchar_all(ncid, varid, start, count, buf+ONE_G); - CHECK_ERR - - err = ncmpi_close(ncid); CHECK_ERR - - /* check if open to read header fine */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR - err = ncmpi_close(ncid); CHECK_ERR -#endif - /* Test classic format */ - /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR - MPI_Info_free(&info); /* define dimensions */ err = ncmpi_def_dim(ncid, "NPROCS", nprocs, &dimid[0]); @@ -259,31 +182,186 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERR /* check if open for reading header */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(comm, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR free(buf); -#ifdef PNC_MALLOC_TRACE - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); + MPI_Info_free(&info); + +err_out: + if (comm != MPI_COMM_WORLD && comm != MPI_COMM_NULL) + MPI_Comm_free(&comm); + + return nerrs; +} + +static +int test_io_nc4(const char *out_path, + MPI_Info global_info) +{ + unsigned char *buf; + int rank, nprocs, color, err, nerrs=0; + int 
ncid, varid, dimid[2]; + MPI_Offset start[2], count[2]; + MPI_Info info; + size_t i; + MPI_Comm comm=MPI_COMM_NULL; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + color = 1; + + if (nprocs > 2) { + /* run on 2 ranks only, as this test allocates memory > 4GB per rank */ + /* split MPI_COMM_WORLD based on 'color' and use the same rank order */ + color = (rank < 2) ? 1 : 0; + MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm); } - if (malloc_size > 0) ncmpi_inq_malloc_list(); -#endif + else + comm = MPI_COMM_WORLD; + + if (!color) goto err_out; - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + buf = (unsigned char*) calloc(TWO_G+1024,1); + if (buf == NULL) { + printf("malloc failed for size "OFFFMT"\n", TWO_G+1024); + MPI_Finalize(); + return 1; } - MPI_Finalize(); - return (nerrs > 0); + MPI_Info_dup(global_info, &info); + MPI_Info_set(info, "romio_cb_write", "enable"); + MPI_Info_set(info, "romio_ds_read", "disable"); /* run slow without it */ + + /* silence internal debug messages */ + setenv("PNETCDF_SAFE_MODE", "0", 1); + + /* Test for NetCDF 4 first as ncvalidator checks only read classic files */ + + /* create a new file for writing ----------------------------------------*/ + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR + + /* define dimensions */ + err = ncmpi_def_dim(ncid, "NPROCS", nprocs, &dimid[0]); + CHECK_ERR + + err = ncmpi_def_dim(ncid, "X", TWO_G+1024, &dimid[1]); + CHECK_ERR + + /* define a big 1D variable of ubyte type */ + err = ncmpi_def_var(ncid, "big_var", NC_UBYTE, 2, dimid, &varid); + CHECK_ERR + + /* do not forget to exit define mode */ + err = ncmpi_enddef(ncid); + CHECK_ERR + + /* now we are in data mode */ + for (i=0; i<20; i++) buf[ONE_G-10+i] = 'a'+i; + for (i=0; i<20; i++) buf[TWO_G-10+i] = 'A'+i; + + start[0] = rank; + count[0] = 1; + + start[1] = 0; + 
count[1] = 10; + err = ncmpi_put_vara_uchar_all(ncid, varid, start, count, buf); + CHECK_ERR + + /* 2nd request is not contiguous from the first */ + start[1] = 1024; + count[1] = ONE_G-1024; + err = ncmpi_put_vara_uchar_all(ncid, varid, start, count, buf+1024); + CHECK_ERR + + /* make file access and write buffer of 3rd request contiguous from the 2nd + * request to check whether the internal fileview and buftype coalescing + * are skipped */ + start[1] = ONE_G; + count[1] = ONE_G+1024; + err = ncmpi_put_vara_uchar_all(ncid, varid, start, count, buf+ONE_G); + CHECK_ERR + + start[1] = 0; + count[1] = 10; + err = ncmpi_get_vara_uchar_all(ncid, varid, start, count, buf); + CHECK_ERR + + start[1] = 1024; + count[1] = ONE_G-1024; + err = ncmpi_get_vara_uchar_all(ncid, varid, start, count, buf+1024); + CHECK_ERR + + start[1] = ONE_G; + count[1] = ONE_G+1024; + err = ncmpi_get_vara_uchar_all(ncid, varid, start, count, buf+ONE_G); + CHECK_ERR + + err = ncmpi_close(ncid); CHECK_ERR + + /* check if open to read header fine */ + err = ncmpi_open(comm, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR + + free(buf); + + MPI_Info_free(&info); + +err_out: + if (comm != MPI_COMM_WORLD && comm != MPI_COMM_NULL) + MPI_Comm_free(&comm); + + return nerrs; +} + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + int err; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + if (format == NC_FORMAT_NETCDF4) + return test_io_nc4(out_path, info); + else + return test_io_nc5(out_path, info); } +int main(int argc, char **argv) { + + int err; +#ifdef ENABLE_NETCDF4 + int formats[] = {NC_FORMAT_NETCDF4, NC_FORMAT_64BIT_DATA}; +#else + int formats[] = {NC_FORMAT_64BIT_DATA}; +#endif + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test 
intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "skip filetype buftype coalesce", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/largefile/large_dims_vars_attrs.c b/test/largefile/large_dims_vars_attrs.c index e924c5087d..68e76890cf 100644 --- a/test/largefile/large_dims_vars_attrs.c +++ b/test/largefile/large_dims_vars_attrs.c @@ -27,42 +27,43 @@ #define LARGE_NUM 102400 -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256], str[32]; - int i, rank, nprocs, err, nerrs=0; - int ncid, cmode, *varid, *dimids, intBuf[1]; -#ifdef PNC_MALLOC_TRACE - int verbose=0; -#endif + char str[32]; + int i, rank, nprocs, color, err, nerrs=0; + int ncid, *varid, *dimids, intBuf[1]; + MPI_Comm comm=MPI_COMM_NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for large DIMS, VARS, ATTRS ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); + color = 1; + + if (nprocs > 2) { + /* run on 2 ranks only, as this test allocates memory > 4GB per rank */ + /* split 
MPI_COMM_WORLD based on 'color' and use the same rank order */ + color = (rank < 2) ? 1 : 0; + MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm); } + else + comm = MPI_COMM_WORLD; + + if (!color) goto err_out; dimids = (int*) malloc(sizeof(int) * LARGE_NUM); varid = (int*) malloc(sizeof(int) * LARGE_NUM); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR for (i=0; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); +err_out: + if (comm != MPI_COMM_WORLD && comm != MPI_COMM_NULL) + MPI_Comm_free(&comm); - if (verbose) { - err = ncmpi_inq_malloc_max_size(&malloc_size); - printf("\n%d: PnetCDF internal memory footprint high water mark %.2f MB\n", - rank, (float)malloc_size/1048576); - } -#endif + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "large DIMS, VARS, ATTRS", 
opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/largefile/large_files.c b/test/largefile/large_files.c index 74b30484b2..364bfb3b84 100644 --- a/test/largefile/large_files.c +++ b/test/largefile/large_files.c @@ -16,14 +16,7 @@ #include #include -#define CHECK_ERR { \ - if (err != NC_NOERR) { \ - nerrs++; \ - printf("Error at line %d in %s: (%s)\n", \ - __LINE__,__FILE__,ncmpi_strerrno(err)); \ - goto fn_exit; \ - } \ -} +#include #define NUMRECS 1 #define I_LEN 4104 @@ -31,10 +24,14 @@ #define K_LEN 1023 #define N_LEN 2 -static int -test_large_file(char *filename, int fmt_flag) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - int err, nerrs=0, ncid, varid, x_id; + int err, nerrs=0, rank, ncid, varid, x_id; int n, rec, i, j, k, dims[4]; MPI_Offset start[4] = {0, 0, 0, 0}; MPI_Offset count[4] = {1, 1, J_LEN, K_LEN}; @@ -42,11 +39,17 @@ test_large_file(char *filename, int fmt_flag) /* I/O buffers */ signed char *buf; - printf("\n*** Testing large files, slowly.\n"); - printf("*** Creating large file %s...", filename); + // printf("\n*** Testing large files, slowly.\n"); + // printf("*** Creating large file %s...", out_path); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank > 0) goto err_out; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - err = ncmpi_create(MPI_COMM_SELF, filename, NC_CLOBBER|fmt_flag, - MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_SELF, out_path, NC_CLOBBER, info, &ncid); if (err != NC_NOERR) { printf("Error at line %d in %s: (%s)\n", __LINE__,__FILE__,ncmpi_strerrno(err)); return 1; @@ -97,10 +100,10 @@ test_large_file(char *filename, int fmt_flag) } err = ncmpi_close(ncid); CHECK_ERR - printf("ok\n"); - printf("*** Reading large file %s...", filename); + // printf("ok\n"); + // printf("*** Reading large file %s...", out_path); - err = ncmpi_open(MPI_COMM_SELF, 
filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); + err = ncmpi_open(MPI_COMM_SELF, out_path, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR /* read variables and check their contents */ @@ -123,7 +126,7 @@ test_large_file(char *filename, int fmt_flag) printf("Error on read, var[%d, %d, %d, %d] = %d wrong, should be %d !\n", rec, i, j, k, buf[j*K_LEN+k], (signed char)n); nerrs++; - goto fn_exit; + goto err_out; } n = (n == 127) ? 0 : (n+1); } @@ -141,49 +144,36 @@ test_large_file(char *filename, int fmt_flag) } } err = ncmpi_close(ncid); CHECK_ERR - -fn_exit: free(buf); - return nerrs; -} -int -main(int argc, char **argv) -{ - char filename[256]; - int nerrs=0, rank; +err_out: + MPI_Barrier(MPI_COMM_WORLD); - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); + return nerrs; +} - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - memset(filename, 0, 256); - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); +int main(int argc, char **argv) { - if (rank > 0) goto prog_exit; + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; - /* Test NetCDF 4 first as ncvalidator checks only classic files */ -#ifdef ENABLE_NETCDF4 - nerrs += test_large_file(filename, NC_NETCDF4); -#endif + MPI_Init(&argc, &argv); - /* Test CDF-5 format */ - nerrs += test_large_file(filename, NC_64BIT_DATA); + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ - if (nerrs == 0) { - printf("ok\n"); - 
printf("*** Tests successful!\n"); - } - else - printf("\n*** Tests failed!\n"); + err = tst_main(argc, argv, "> 4 GiB file", opt, test_io); -prog_exit: MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/largefile/large_header.c b/test/largefile/large_header.c index a376496609..5945e44d00 100644 --- a/test/largefile/large_header.c +++ b/test/largefile/large_header.c @@ -22,37 +22,26 @@ #include #include -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; int rank, nprocs, err, nerrs=0; - int ncid, cmode, dimid, varid, buf; + int ncid, dimid, varid, buf; MPI_Offset extent, start; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for large header ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define a dimension of size = nprocs */ @@ -79,10 +68,8 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERR - if (err != NC_NOERR) goto err_out; - - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, - &ncid); CHECK_ERR + 
err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); + CHECK_ERR /* inquire ID of the variable */ err = ncmpi_inq_varid(ncid, "var1", &varid); @@ -94,35 +81,38 @@ int main(int argc, char** argv) CHECK_ERR if (buf != rank) { - nerrs++; printf("Error at line %d in %s: expecting read buf %d but got %d\n", __LINE__,__FILE__,rank,buf); + nerrs++; } err = ncmpi_close(ncid); CHECK_ERR -#ifdef PNC_MALLOC_TRACE - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); -#endif - -err_out: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "large header", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/largefile/large_reqs.c b/test/largefile/large_reqs.c index b55cbc4f9f..098f5bb3b9 100644 --- a/test/largefile/large_reqs.c +++ b/test/largefile/large_reqs.c @@ -30,10 +30,10 @@ static int 
verbose; static -int tst_one_var(char *filename, MPI_Comm comm) +int tst_one_var(MPI_Comm comm, const char *filename, MPI_Info info) { size_t i, buf_len; - int rank, nprocs, err, nerrs=0, ncid, cmode, varid, dimid[3], psize[2]; + int rank, nprocs, err, nerrs=0, ncid, varid, dimid[3], psize[2]; int *buf; MPI_Offset start[3], count[3]; @@ -46,8 +46,7 @@ int tst_one_var(char *filename, MPI_Comm comm) /* Test classic CDF-5 format */ /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(comm, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(comm, filename, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimensions Z, Y, and X */ @@ -150,11 +149,11 @@ int tst_one_var(char *filename, MPI_Comm comm) #define LEN 1024 static -int tst_vars(char *filename, MPI_Comm comm) +int tst_vars(MPI_Comm comm, const char *filename, MPI_Info info) { size_t i, buf_len; int rank, nprocs, err, nerrs=0, *buf, *buf_ptr; - int ncid, cmode, *varid, dimid[3], gap, psize[2]={0,0}; + int ncid, *varid, dimid[3], gap, psize[2]={0,0}; MPI_Offset start[3], count[3]; MPI_Comm_size(comm, &nprocs); @@ -166,8 +165,7 @@ int tst_vars(char *filename, MPI_Comm comm) /* Test classic CDF-5 format */ /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(comm, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(comm, filename, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimensions Z, Y, and X */ @@ -269,36 +267,24 @@ int tst_vars(char *filename, MPI_Comm comm) return nerrs; } -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; - int rank, nprocs, nerrs=0, color; + int rank, nprocs, err, nerrs=0, color; MPI_Comm comm; -#ifdef PNC_MALLOC_TRACE - int err; -#endif + verbose = 0; - MPI_Init(&argc, 
&argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for large requests ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR color = 1; @@ -313,34 +299,39 @@ int main(int argc, char** argv) if (color) { /* test one big variable */ - nerrs += tst_one_var(filename, comm); + nerrs += tst_one_var(comm, out_path, info); /* test a large number of smaller variables */ - nerrs += tst_vars(filename, comm); + nerrs += tst_vars(comm, out_path, info); } if (comm != MPI_COMM_WORLD) MPI_Comm_free(&comm); -#ifdef PNC_MALLOC_TRACE - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); -#endif + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO 
driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "large requests", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/largefile/large_var.c b/test/largefile/large_var.c index de12b0a187..a90b320712 100644 --- a/test/largefile/large_var.c +++ b/test/largefile/large_var.c @@ -58,47 +58,27 @@ swapn(void *buf, } #endif -int main(int argc, char** argv) +static +int test_io_nc4(const char *out_path, + MPI_Info global_info) { - char filename[256]; size_t bufsize; - int i, j, rank, nprocs, err, nerrs=0, expected; - int ncid, cmode, varid, dimid[3], req[3], st[3], *buf, *buf_ptr; - MPI_Offset offset, var_offset, start[3], count[3]; - MPI_File fh; - MPI_Status status; + int i, rank, nprocs, err, nerrs=0, expected; + int ncid, varid, dimid[3], *buf; + MPI_Offset start[3], count[3]; MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for writing to a large variable ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - MPI_Info_create(&info); + MPI_Info_dup(global_info, &info); MPI_Info_set(info, "romio_ds_write", "disable"); MPI_Info_set(info, "romio_ds_read", "disable"); -#ifdef ENABLE_NETCDF4 /* Test NetCDF-4 feature */ 
+ /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_NETCDF4; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimensions Z, Y, and X */ @@ -151,7 +131,7 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERR /* open the same file and read back for validation */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_inq_varid(ncid, "var", &varid); CHECK_ERR @@ -248,12 +228,35 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERR free(buf); -#endif + + MPI_Info_free(&info); + + return nerrs; +} + +static +int test_io_nc5(const char *out_path, + MPI_Info global_info) +{ + size_t bufsize; + int i, j, rank, nprocs, err, nerrs=0, expected; + int ncid, varid, dimid[3], req[3], st[3], *buf, *buf_ptr; + MPI_Offset offset, var_offset, start[3], count[3]; + MPI_File fh; + MPI_Status status; + MPI_Info info; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + MPI_Info_dup(global_info, &info); + MPI_Info_set(info, "romio_ds_write", "disable"); + MPI_Info_set(info, "romio_ds_read", "disable"); /* Test classic format */ + /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimensions Z, Y, and X */ @@ -315,7 +318,7 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERR /* open the same file and read back for validation */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_inq_varid(ncid, "var", 
&varid); CHECK_ERR @@ -413,7 +416,7 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERR /* MPI file open the same file and read back for validation */ - err = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); + err = MPI_File_open(MPI_COMM_WORLD, out_path, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); if (err != MPI_SUCCESS) { int errorStringLen; char errorString[MPI_MAX_ERROR_STRING]; @@ -547,26 +550,54 @@ int main(int argc, char** argv) free(buf); MPI_Info_free(&info); -#ifdef PNC_MALLOC_TRACE - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); + return nerrs; +} + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + int err; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + if (format == NC_FORMAT_NETCDF4) + return test_io_nc4(out_path, info); + + return test_io_nc5(out_path, info); +} + +int main(int argc, char **argv) { + + int err; +#ifdef ENABLE_NETCDF4 + int formats[] = {NC_FORMAT_NETCDF4, NC_FORMAT_64BIT_DATA}; +#else + int formats[] = {NC_FORMAT_64BIT_DATA}; #endif + loop_opts opt; - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint 
nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "writing to a large variable", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/largefile/parallel_run.sh b/test/largefile/parallel_run.sh index 8df4783e3f..da174c4a67 100755 --- a/test/largefile/parallel_run.sh +++ b/test/largefile/parallel_run.sh @@ -1,47 +1,56 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" -# turn off safe mode for large tests -export PNETCDF_SAFE_MODE=0 +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + for i in ${check_PROGRAMS} ; do - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - rm -f 
${OUTDIR}/$i.nc - # echo "" - - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc - unset PNETCDF_HINTS - - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc - fi - - rm -f ${OUTDIR}/$i.bb.nc -done + + exe_name=`basename $i` + + for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc + + done # safe_modes + +done # check_PROGRAMS diff --git a/test/largefile/seq_runs.sh b/test/largefile/seq_runs.sh index e28955773d..205d1fb396 100755 --- a/test/largefile/seq_runs.sh +++ b/test/largefile/seq_runs.sh @@ -1,25 +1,44 @@ -#!/bin/sh +#!/bin/bash # -# Copyright (C) 2003, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} -# disable safe mode, as large tests already run slow -export PNETCDF_SAFE_MODE=0 +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +exe_name=`basename $1` # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -for i in ${TESTPROGRAMS}; do - ${TESTSEQRUN} ./$i ${TESTOUTDIR}/$i.nc - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.nc -done +for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc + +done # safe_modes + diff --git a/test/largefile/tst_cdf5_begin.c b/test/largefile/tst_cdf5_begin.c index b965fba707..e963fd6bac 100644 --- a/test/largefile/tst_cdf5_begin.c +++ b/test/largefile/tst_cdf5_begin.c @@ -83,33 +83,26 @@ typedef MPI_Offset len_t; * contents of the possible overlaps between the two variables. 
*/ -int main(int argc, char** argv) { - char filename[256]; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ int i, err, rank, nprocs, nerrs=0, ncid, dimid[2], varid[2]; short buf[10]; len_t start[1], count[1]; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for checking CDF-5 writes", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - err = FileCreate(MPI_COMM_WORLD, filename, NC_CLOBBER|NC_64BIT_DATA, - MPI_INFO_NULL, &ncid); CHECK_ERR + err = FileCreate(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR err = DefDim(ncid, "dim0", NC_MAX_UINT, &dimid[0]); CHECK_ERR err = DefDim(ncid, "dim1", 10, &dimid[1]); CHECK_ERR @@ -142,31 +135,44 @@ int main(int argc, char** argv) { err = FileClose(ncid); CHECK_ERR /* check if open to read header fine */ - err = FileOpen(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = FileOpen(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); + CHECK_ERR err = FileClose(ncid); CHECK_ERR #ifdef TEST_NETCDF if (nerrs) printf("fail with %d mismatches\n",nerrs); else printf("pass\n"); -#else -#ifdef PNC_MALLOC_TRACE - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR && malloc_size > 0) /* this test is for running 1 process */ - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - 
malloc_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); #endif - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; +#ifdef ENABLE_NETCDF4 + int formats[] = {NC_FORMAT_NETCDF4, NC_FORMAT_64BIT_DATA}; +#else + int formats[] = {NC_FORMAT_64BIT_DATA}; #endif + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "CDF-5 writes", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/largefile/tst_flarge.f90 b/test/largefile/tst_flarge.f90 index 7d557add52..f72cd92b4e 100644 --- a/test/largefile/tst_flarge.f90 +++ b/test/largefile/tst_flarge.f90 @@ -26,22 +26,29 @@ program tst_flarge integer :: cmode, err, ierr, get_args double precision dbl_buf(1) integer(KIND=MPI_OFFSET_KIND) :: start(1), count(1) - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer my_rank, p + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (my_rank .EQ. 
0) then - filename = FILE_NAME - err = get_args(cmd, filename) + out_path = FILE_NAME + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) ! if (p .ne. 1 .AND. my_rank .eq. 0) then ! print *, 'Warning: ',trim(cmd),' is design to run on 1 process' @@ -49,7 +56,7 @@ program tst_flarge ! Create the file with 2 NF90_DOUBLE vars, each with one really long dimension. cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - call check(nf90mpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, ncFileID)) + call check(nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, MPI_INFO_NULL, ncFileID)) call check(nf90mpi_def_dim(ncFileID, dimName, BIG_DIMENSION, dimID)) call check(nf90mpi_def_var(ncFileID, var1Name, nf90_double, (/ dimID /), varID1) ) call check(nf90mpi_def_var(ncFileID, var2Name, nf90_double, (/ dimID /), varID2) ) @@ -70,7 +77,7 @@ program tst_flarge call check(nf90mpi_close(ncFileID)) ! Now open the file to read and check a few values - call check(nf90mpi_open(MPI_COMM_WORLD, filename, NF90_NOWRITE, MPI_INFO_NULL, ncFileID)) + call check(nf90mpi_open(MPI_COMM_WORLD, out_path, NF90_NOWRITE, MPI_INFO_NULL, ncFileID)) call check(nf90mpi_begin_indep_data(ncFileID)) start(1) = 1 call check(nf90mpi_get_var(ncFileID, VarID1, val1_in, start)) @@ -83,10 +90,21 @@ program tst_flarge call check(nf90mpi_close(ncFileID)) - msg = '*** TESTING F90 '//trim(cmd)//' for large files' - if (my_rank .eq. 0) call pass_fail(0, msg) + 999 timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + + if (my_rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' for large files' + call pass_fail(0, msg, timing) + endif - 999 call MPI_Finalize(ierr) + call MPI_Finalize(ierr) contains ! Internal subroutine - checks error status after each netcdf, prints out text message each time diff --git a/test/largefile/tst_hash_large_ndims.c b/test/largefile/tst_hash_large_ndims.c index f58eb41c00..954e1593eb 100644 --- a/test/largefile/tst_hash_large_ndims.c +++ b/test/largefile/tst_hash_large_ndims.c @@ -23,49 +23,34 @@ #define NDIMS 400000 -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; - int i, rank, nprocs, err, nerrs=0, ncid, cmode, dimid, verbose=1; + size_t j; + int rank, nprocs, err, nerrs=0, ncid, dimid, verbose=0; double timing[3], max_timing[3]; #ifdef PNC_MALLOC_TRACE - MPI_Offset malloc_size[2], sum_size, max_size[2]; + MPI_Offset malloc_size[2], max_size[2]; #endif - MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for hasing large ndims ", - basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - if (verbose && rank == 0) printf("\nNDIMS = %d\n", NDIMS); - MPI_Info_create(&info); MPI_Info_set(info, "nc_hash_size_dim", "2048"); - /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = 
ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); CHECK_ERR - MPI_Info_free(&info); + /* create a new file for writing ----------------------------------------*/ + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR #ifdef PNC_MALLOC_TRACE err = ncmpi_inq_malloc_size(&malloc_size[0]); CHECK_ERR @@ -82,9 +67,9 @@ int main(int argc, char** argv) MPI_Barrier(MPI_COMM_WORLD); timing[0] = MPI_Wtime(); - for (i=0; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - } if (malloc_size[0] > 0) ncmpi_inq_malloc_list(); #endif @@ -163,13 +139,31 @@ int main(int argc, char** argv) if (verbose && rank == 0) printf("Time ncmpi_def_dim = %.4f ncmpi_enddef = %.4f ncmpi_close = %.4f\n", max_timing[0],max_timing[1],max_timing[2]); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, " hashing large ndims", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/largefile/tst_hash_large_ngattrs.c b/test/largefile/tst_hash_large_ngattrs.c index c6732c5d5f..9714075a3c 100644 --- 
a/test/largefile/tst_hash_large_ngattrs.c +++ b/test/largefile/tst_hash_large_ngattrs.c @@ -23,49 +23,33 @@ #define NATTRS 400000 -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; - int i, rank, nprocs, err, nerrs=0, ncid, cmode, verbose=1; + size_t j; + int rank, err, nerrs=0, ncid, verbose=0; double timing[3], max_timing[3]; #ifdef PNC_MALLOC_TRACE - MPI_Offset malloc_size[2], sum_size, max_size[2]; + MPI_Offset malloc_size[2], max_size[2]; #endif - MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for hashing of large gattr ", - basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } if (verbose && rank == 0) printf("\nNATTRS = %d\n", NATTRS); - MPI_Info_create(&info); MPI_Info_set(info, "nc_hash_size_gattr", "2048"); - /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); CHECK_ERR - MPI_Info_free(&info); + /* create a new file for writing ----------------------------------------*/ + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR #ifdef PNC_MALLOC_TRACE err = ncmpi_inq_malloc_size(&malloc_size[0]); CHECK_ERR @@ -82,10 +66,11 @@ int main(int argc, char** argv) 
MPI_Barrier(MPI_COMM_WORLD); timing[0] = MPI_Wtime(); - for (i=0; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - } if (malloc_size[0] > 0) ncmpi_inq_malloc_list(); #endif @@ -163,13 +139,31 @@ int main(int argc, char** argv) if (verbose && rank == 0) printf("Time ncmpi_put_att = %.4f ncmpi_enddef = %.4f ncmpi_close = %.4f\n", max_timing[0],max_timing[1],max_timing[2]); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "hashing of large number of gattr", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/largefile/tst_hash_large_nvars.c b/test/largefile/tst_hash_large_nvars.c index 5040eb8aac..dac833d77f 100644 --- a/test/largefile/tst_hash_large_nvars.c +++ b/test/largefile/tst_hash_large_nvars.c @@ -23,52 +23,36 @@ #define NVARS 400000 -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; - int i, rank, nprocs, err, nerrs=0, ncid, cmode, dimid, *varid, verbose=1; + int i, rank, nprocs, err, nerrs=0, ncid, dimid, *varid, verbose=0; double timing[4], 
max_timing[4]; #ifdef PNC_MALLOC_TRACE - MPI_Offset malloc_size[2], sum_size, max_size[2]; + MPI_Offset malloc_size[2], max_size[2]; #endif - MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for hashing of large nvars ", - basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - if (verbose && rank == 0) printf("\nNVARS = %d\n", NVARS); varid = (int*) malloc(sizeof(int) * NVARS); - MPI_Info_create(&info); MPI_Info_set(info, "nc_hash_size_var", "2048"); MPI_Info_set(info, "nc_hash_size_vattr", "2"); - /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); CHECK_ERR - MPI_Info_free(&info); + /* create a new file for writing ----------------------------------------*/ + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR #ifdef PNC_MALLOC_TRACE err = ncmpi_inq_malloc_size(&malloc_size[0]); CHECK_ERR @@ -181,15 +165,6 @@ int main(int argc, char** argv) printf("After ncmpi_close, PnetCDF memory footprint %4lld B\n", max_size[0]); } - - /* check if PnetCDF freed all internal malloc */ - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size[0], &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - } 
if (malloc_size[0] > 0) ncmpi_inq_malloc_list(); #endif @@ -198,13 +173,31 @@ int main(int argc, char** argv) printf("Time ncmpi_def_var = %.4f ncmpi_put_att = %.4f ncmpi_enddef = %.4f ncmpi_close = %.4f\n", max_timing[0],max_timing[1],max_timing[2],max_timing[3]); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "hashing of large number of vars", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nc4/Makefile.am b/test/nc4/Makefile.am index b25c30dd9e..9ff73cd04c 100644 --- a/test/nc4/Makefile.am +++ b/test/nc4/Makefile.am @@ -69,20 +69,32 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_NETCDF4 + TESTS_ENVIRONMENT += export 
ENABLE_NETCDF4=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + TESTS = $(TESTPROGRAMS) TEST_EXTENSIONS = .sh LOG_COMPILER = $(srcdir)/wrap_runs.sh SH_LOG_COMPILER = -NC_FILES = $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.nc) - CLEANFILES = $(M4_SRCS:.m4=.c) \ - $(TESTOUTDIR)/put_get_all_kinds.nc.cdf4 \ - $(NC_FILES) testfile.nc + $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb EXTRA_DIST = $(M4_SRCS) wrap_runs.sh parallel_run.sh @@ -115,7 +127,7 @@ ptest6: $(check_PROGRAMS) @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 6 || exit 1 -ptests: ptest2 ptest4 ptest6 +ptests: ptest4 ptest6 ptest8 ptest10: # build check targets but not invoke diff --git a/test/nc4/compressed.c b/test/nc4/compressed.c index e9e18f9d8a..793dd189cf 100644 --- a/test/nc4/compressed.c +++ b/test/nc4/compressed.c @@ -15,12 +15,16 @@ #define FNAME "gzip_example.nc" int main(int argc, char **argv) { - int err, nerrs=0, rank, np; + int err, nerrs=0, rank; int ncid, ndims, nvars; char *dir_name=".", filename[512]; + double timing; + MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &np); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (argc > 2) { @@ -32,8 +36,8 @@ int main(int argc, char **argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for reading compressed file", basename(argv[0])); - printf("%-66s ------ ", cmd_str); + sprintf(cmd_str, "*** TESTING C %s - reading compressed file", basename(argv[0])); + printf("%-63s -- ", cmd_str); free(cmd_str); } @@ -60,10 +64,12 @@ int main(int argc, char **argv) { sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); 
} MPI_Finalize(); diff --git a/test/nc4/interoperability_rd.m4 b/test/nc4/interoperability_rd.m4 index 0bfa86d80c..aa2da0a0ce 100644 --- a/test/nc4/interoperability_rd.m4 +++ b/test/nc4/interoperability_rd.m4 @@ -149,7 +149,12 @@ int main(int argc, char **argv) { int did[2]; char *filename = FILE_NAME; + double timing; + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_size(MPI_COMM_WORLD, &np); MPI_Comm_rank(MPI_COMM_WORLD, &rank); @@ -162,8 +167,8 @@ int main(int argc, char **argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for interoperability file", basename(argv[0])); - printf("%-66s ------ ", cmd_str); + sprintf(cmd_str, "*** TESTING C %s - interoperability file", basename(argv[0])); + printf("%-63s -- ", cmd_str); free(cmd_str); } @@ -227,10 +232,12 @@ err_out: sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/interoperability_wr.m4 b/test/nc4/interoperability_wr.m4 index fcbf47cdd7..b9ead475b7 100644 --- a/test/nc4/interoperability_wr.m4 +++ b/test/nc4/interoperability_wr.m4 @@ -150,7 +150,12 @@ int main(int argc, char **argv) { int did[2]; char *filename = FILE_NAME; + double timing; + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_size(MPI_COMM_WORLD, &np); MPI_Comm_rank(MPI_COMM_WORLD, &rank); @@ -163,8 +168,8 @@ int main(int argc, char **argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for interoperability file", basename(argv[0])); - printf("%-66s ------ ", cmd_str); + sprintf(cmd_str, "*** TESTING C %s - interoperability file", basename(argv[0])); + printf("%-63s -- ", cmd_str); free(cmd_str); } @@ 
-231,10 +236,12 @@ foreach(`dt', (`(`0', `schar', `char')', dnl sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/noclobber.c b/test/nc4/noclobber.c index 6bad7447ab..6841f4aece 100644 --- a/test/nc4/noclobber.c +++ b/test/nc4/noclobber.c @@ -19,10 +19,14 @@ int main(int argc, char **argv) { char filename[256]; - int err, nerrs=0, ncid, cmode, rank, nprocs; + int err, nerrs=0, ncid, cmode, rank; + + double timing; MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (argc > 2) { @@ -36,8 +40,8 @@ int main(int argc, char **argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for NC_NOCLOBBER and NC_EEXIST ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + sprintf(cmd_str, "*** TESTING C %s - NC_NOCLOBBER and NC_EEXIST ", basename(argv[0])); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -68,10 +72,12 @@ int main(int argc, char **argv) { if (malloc_size > 0) ncmpi_inq_malloc_list(); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/notsupport.c b/test/nc4/notsupport.c index 2165067b80..7b38b6ccd3 100644 --- a/test/nc4/notsupport.c +++ b/test/nc4/notsupport.c @@ -36,7 +36,7 @@ int main(int argc, char** argv) { char filename[256]; - int rank, nprocs, err, nerrs=0; + int rank, err, nerrs=0; int ncid, 
cmode, varid, dimid, buf; int v1, v2; MPI_Comm comm=MPI_COMM_WORLD; @@ -44,9 +44,13 @@ int main(int argc, char** argv) { MPI_Offset start[1] = {0}; MPI_Datatype btype; + double timing; + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &nprocs); if (argc > 2) { if (!rank) printf("Usage: %s [filename]\n",argv[0]); @@ -59,8 +63,8 @@ int main(int argc, char** argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for error NC_ENOTSUPPORT ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + sprintf(cmd_str, "*** TESTING C %s - error NC_ENOTSUPPORT ", basename(argv[0])); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -142,10 +146,12 @@ int main(int argc, char** argv) { sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, comm); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/parallel_run.sh b/test/nc4/parallel_run.sh index 6e0dfa371b..3e7d3abb7e 100755 --- a/test/nc4/parallel_run.sh +++ b/test/nc4/parallel_run.sh @@ -15,7 +15,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "check_PROGRAMS=${check_PROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,20 +26,19 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do + PNETCDF_HINTS= if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= + PNETCDF_HINTS="romio_no_indep_rw=true" fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" 
+ if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2;$PNETCDF_HINTS" fi + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc done - done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.nc.cdf4 done diff --git a/test/nc4/pres_temp_4D.c b/test/nc4/pres_temp_4D.c index d37acfde48..0128ea51b0 100644 --- a/test/nc4/pres_temp_4D.c +++ b/test/nc4/pres_temp_4D.c @@ -382,7 +382,12 @@ main(int argc, char ** argv) char filename[256]; int nprocs, rank, err, nerrs=0; + double timing; + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); @@ -397,8 +402,8 @@ main(int argc, char ** argv) if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for NetCDF4 file", basename(argv[0])); - printf("%-66s ------ ", cmd_str); + sprintf(cmd_str, "*** TESTING C %s - NetCDF4 file", basename(argv[0])); + printf("%-63s -- ", cmd_str); free(cmd_str); } @@ -417,10 +422,12 @@ main(int argc, char ** argv) sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/put_get_all_kinds.m4 b/test/nc4/put_get_all_kinds.m4 index f5b1527482..5890e920b1 100644 --- a/test/nc4/put_get_all_kinds.m4 +++ b/test/nc4/put_get_all_kinds.m4 @@ -155,7 +155,12 @@ int main(int argc, char **argv) MPI_Offset startM[NDIMS], countM[NDIMS]; MPI_Info info; - MPI_Init(&argc,&argv); + double timing; + + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); @@ -170,8 +175,8 @@ int main(int argc, char **argv) if (rank == 0) { char 
*cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for all kinds put APIs ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + sprintf(cmd_str, "*** TESTING C %s - all kinds put APIs ", basename(argv[0])); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -242,10 +247,12 @@ int main(int argc, char **argv) sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/rd_compressed.c b/test/nc4/rd_compressed.c index 19b7551081..2f0075703a 100644 --- a/test/nc4/rd_compressed.c +++ b/test/nc4/rd_compressed.c @@ -63,12 +63,16 @@ static int create_nc4(char *filename) int main(int argc, char **argv) { char filename[512]; - int i, err, nerrs=0, rank, np; + int i, err, nerrs=0, rank; int ncid, ndims, nvars, varid, dimids[3], *buf; MPI_Offset bufLen; + double timing; + MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &np); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (argc > 2) { @@ -81,8 +85,8 @@ int main(int argc, char **argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for reading compressed NetCDF4 file", basename(argv[0])); - printf("%-66s ------ ", cmd_str); + sprintf(cmd_str, "*** TESTING C %s - reading compressed NetCDF4 file", basename(argv[0])); + printf("%-63s -- ", cmd_str); free(cmd_str); } @@ -170,10 +174,12 @@ int main(int argc, char **argv) { sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else 
printf(PASS_STR); + else printf(PASS_STR, timing); } fn_exit: diff --git a/test/nc4/simple_xy.c b/test/nc4/simple_xy.c index a8bfe1fb43..889e864373 100644 --- a/test/nc4/simple_xy.c +++ b/test/nc4/simple_xy.c @@ -49,7 +49,7 @@ int main(int argc, char** argv) { char filename[256]; - int i, j, nerrs=0, rank, nprocs, err; + int i, j, nerrs=0, rank, err; int ncid, x_dimid, y_dimid, varid, ndim; int dimids[2]; int data_out[NX][NY]; @@ -58,9 +58,13 @@ int main(int argc, char** argv) { char tmp[1024]; int x, y; + double timing; + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (argc > 2) { if (!rank) printf("Usage: %s [filename]\n",argv[0]); @@ -74,9 +78,9 @@ int main(int argc, char** argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); sprintf(cmd_str, - "*** TESTING C %s for opening and reading a netcdf4 file", + "*** TESTING C %s - opening and reading a netcdf4 file", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -162,10 +166,12 @@ int main(int argc, char** argv) { malloc_size); fn_exit: + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/tst_2_rec_dims.c b/test/nc4/tst_2_rec_dims.c index 2f8d40ae9c..066d0d80f3 100644 --- a/test/nc4/tst_2_rec_dims.c +++ b/test/nc4/tst_2_rec_dims.c @@ -26,16 +26,20 @@ int main(int argc, char** argv) { char filename[512]; - int i, rank, nprocs, err, nerrs=0, buf[16]; + int i, rank, err, nerrs=0, buf[16]; int ncid, mode, dimids[2], varid, num_rec_vars; MPI_Comm comm=MPI_COMM_WORLD; MPI_Info info=MPI_INFO_NULL; MPI_Offset recsize; size_t start[2], count[2]; 
+ double timing; + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &nprocs); if (argc > 2) { if (!rank) printf("Usage: %s [filename]\n",argv[0]); @@ -48,8 +52,8 @@ int main(int argc, char** argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for reading file with 2 rec dims ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + sprintf(cmd_str, "*** TESTING C %s - reading file with 2 rec dims ", basename(argv[0])); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -112,10 +116,12 @@ int main(int argc, char** argv) { sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, comm); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/tst_get_put_size.c b/test/nc4/tst_get_put_size.c index 9f05ff2488..55b72fb7ba 100644 --- a/test/nc4/tst_get_put_size.c +++ b/test/nc4/tst_get_put_size.c @@ -31,7 +31,12 @@ int main(int argc, char** argv) { MPI_Offset start[2]; MPI_Offset size; + double timing; + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); @@ -46,8 +51,8 @@ int main(int argc, char** argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for get size and put size ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + sprintf(cmd_str, "*** TESTING C %s - get size and put size ", basename(argv[0])); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -98,10 +103,12 @@ int main(int argc, char** argv) { sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); 
MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, comm); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/tst_rec_vars.c b/test/nc4/tst_rec_vars.c index 3286881350..a257afce46 100644 --- a/test/nc4/tst_rec_vars.c +++ b/test/nc4/tst_rec_vars.c @@ -30,7 +30,12 @@ int main(int argc, char** argv) { MPI_Info info=MPI_INFO_NULL; MPI_Offset start[1], count[1]; + double timing; + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); @@ -45,8 +50,8 @@ int main(int argc, char** argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for record variables to NetCDF4 file ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + sprintf(cmd_str, "*** TESTING C %s - record variables to NetCDF4 file", basename(argv[0])); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -136,10 +141,12 @@ int main(int argc, char** argv) { sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, comm); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/tst_zero_req.c b/test/nc4/tst_zero_req.c index 04d780c1ca..bdc3b4ad88 100644 --- a/test/nc4/tst_zero_req.c +++ b/test/nc4/tst_zero_req.c @@ -134,7 +134,12 @@ int main(int argc, char **argv) { char filename[256]; int rank=0, nerrs=0; - MPI_Init(&argc,&argv); + double timing; + + MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (argc > 2) { @@ -148,8 +153,8 @@ int main(int argc, char **argv) { if (rank == 0) { char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for zero-length request ", 
basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); + sprintf(cmd_str, "*** TESTING C %s - zero-length request ", basename(argv[0])); + printf("%-63s -- ", cmd_str); fflush(stdout); free(cmd_str); } @@ -177,10 +182,12 @@ int main(int argc, char **argv) { if (malloc_size > 0) ncmpi_inq_malloc_list(); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc4/wrap_runs.sh b/test/nc4/wrap_runs.sh index 10d76802b2..0ff63a15af 100755 --- a/test/nc4/wrap_runs.sh +++ b/test/nc4/wrap_runs.sh @@ -13,7 +13,7 @@ outfile=`basename $1` OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -23,10 +23,33 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.nc.cdf4 diff --git a/test/nc_test/Makefile.am b/test/nc_test/Makefile.am index 8cd608d83d..b0742e1c9a 100644 --- 
a/test/nc_test/Makefile.am +++ b/test/nc_test/Makefile.am @@ -65,29 +65,33 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export ENABLE_NETCDF4="$(ENABLE_NETCDF4)"; + +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_NETCDF4 + TESTS_ENVIRONMENT += export ENABLE_NETCDF4=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif TESTS = $(TESTPROGRAMS) seq_runs.sh TEST_EXTENSIONS = .sh LOG_COMPILER = $(srcdir)/wrap_runs.sh SH_LOG_COMPILER = -NC_FILES = $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.nc) \ - $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) - CLEANFILES = tests.h $(M4_SRCS:.m4=.c) \ - $(TESTOUTDIR)/test.nc \ - $(TESTOUTDIR)/scratch.nc \ - $(TESTOUTDIR)/tooth-fairy.nc \ - $(TESTOUTDIR)/tst_nofill.nc.fill \ - $(TESTOUTDIR)/tst_nofill.nc.nofill \ - $(TESTOUTDIR)/tst_atts3.nc.2 \ - $(TESTOUTDIR)/tst_atts3.bb.nc.2 \ - core core.* *.gcda *.gcno *.gcov gmon.out $(NC_FILES) + $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ + core core.* *.gcda *.gcno *.gcov gmon.out EXTRA_DIST = error.h $(M4_SRCS) $(M4_HFILES) README seq_runs.sh wrap_runs.sh diff --git a/test/nc_test/nc_test.c b/test/nc_test/nc_test.c index 3d34ff9f7e..d0f9e92bf5 100644 --- a/test/nc_test/nc_test.c +++ b/test/nc_test/nc_test.c @@ -145,8 +145,12 @@ main(int argc, char *argv[]) (void) signal(SIGFPE, SIG_IGN); #endif + double timing; + 
MPI_Init(&argc, &argv); + timing = MPI_Wtime(); + cdf_format = 1; /* 1: CDF-1, 2: CDF-2 5: CDF-5 */ read_only = 0; /* assume may write in test dir as default */ verbose = 0; @@ -579,8 +583,11 @@ main(int argc, char *argv[]) fn_exit: MPI_Info_free(&info); + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); + if (nfailsTotal == 0) { - printf(PASS_STR); + printf(PASS_STR, timing); } else { print("\n%s: expects 0 failures ... ",argv[0]); diff --git a/test/nc_test/seq_runs.sh b/test/nc_test/seq_runs.sh index e8cd748cb8..4dcbe71629 100755 --- a/test/nc_test/seq_runs.sh +++ b/test/nc_test/seq_runs.sh @@ -34,7 +34,7 @@ rm -f ${OUTDIR}/tooth-fairy.nc ${OUTDIR}/scratch.nc ${OUTDIR}/test.nc ${TESTSEQRUN} ./nc_test -2 -d ${TESTOUTDIR} ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/test.nc -if test "${ENABLE_NETCDF4}" = 1 ; then +if test "x${ENABLE_NETCDF4}" = x1 ; then rm -f ${OUTDIR}/tooth-fairy.nc ${OUTDIR}/scratch.nc ${OUTDIR}/test.nc ${TESTSEQRUN} ./nc_test -4 -d ${TESTOUTDIR} # Validator does not support nc4 diff --git a/test/nc_test/t_nc.c b/test/nc_test/t_nc.c index c15b8d52d7..4cb6ad8942 100644 --- a/test/nc_test/t_nc.c +++ b/test/nc_test/t_nc.c @@ -112,32 +112,32 @@ static MPI_Offset sizes[] = { NC_UNLIMITED, SIZE_1 , SIZE_2 }; static const char * const dim_names[] = { "record", "ixx", "iyy"}; static int -createtestdims(int cdfid, size_t num_dims, const MPI_Offset *sizes, const char * const dim_names[]) +createtestdims(int cdfid, size_t ndims, const MPI_Offset *dim_sizes, const char * const names[]) { int dimid, err; - while(num_dims-- != 0) + while(ndims-- != 0) { - err = ncmpi_def_dim(cdfid, *dim_names++, *sizes, &dimid); ERR - sizes++; + err = ncmpi_def_dim(cdfid, *names++, *dim_sizes, &dimid); ERR + dim_sizes++; } return 0; } static int -testdims(int cdfid, size_t num_dims, MPI_Offset *sizes, const char * const dim_names[]) +testdims(int cdfid, size_t ndims, MPI_Offset *dim_sizes, const char * const names[]) 
{ int ii, err; MPI_Offset size; char cp[NC_MAX_NAME]; - for(ii=0; (size_t) ii < num_dims; ii++, sizes++) + for(ii=0; (size_t) ii < ndims; ii++, dim_sizes++) { err = ncmpi_inq_dim(cdfid, ii, cp, &size); ERR - if( size != *sizes) + if( size != *dim_sizes) (void) fprintf(stderr, "%d: %lu != %lu\n", - ii, (unsigned long)size, (unsigned long)*sizes); - if ( size != *sizes) return 1; - if ( strcmp(cp, *dim_names++) != 0) return 1; + ii, (unsigned long)size, (unsigned long)*dim_sizes); + if ( size != *dim_sizes) return 1; + if ( strcmp(cp, *names++) != 0) return 1; } return 0; } @@ -195,11 +195,11 @@ static struct tcdfvar { #define NUM_TESTVARS 6 static int -createtestvars(int id, const struct tcdfvar *testvars, size_t count) +createtestvars(int id, const struct tcdfvar *vars, size_t count) { int ii, err; int varid; - const struct tcdfvar *vp = testvars; + const struct tcdfvar *vp = vars; for(ii = 0; (size_t) ii < count; ii++, vp++ ) { @@ -617,8 +617,12 @@ int main(int argc, char *argv[]) { char filename[256]; int rank, nprocs, cmode, err, nerrs=0; + double timing; MPI_Init(&argc, &argv); + + timing = MPI_Wtime(); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); @@ -660,10 +664,12 @@ int main(int argc, char *argv[]) sum_size); } + timing = MPI_Wtime() - timing; + MPI_Allreduce(MPI_IN_PLACE, &timing, 1, MPI_DOUBLE, MPI_MAX,MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + else printf(PASS_STR, timing); } MPI_Finalize(); diff --git a/test/nc_test/test_get.m4 b/test/nc_test/test_get.m4 index 8474d1ab03..f66c9e3016 100644 --- a/test/nc_test/test_get.m4 +++ b/test/nc_test/test_get.m4 @@ -492,13 +492,16 @@ ifelse(`$1',`uchar',`ifdef(`PNETCDF',,``#'endif')') start[j] = 0; continue; } -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err 
!= NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) -#endif + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + } + start[j] = var_shape[i][j]+1; /* should cause NC_EINVALCOORDS */ err = GetVara($1)(ncid, i, start, edge, value); IF (err != NC_EINVALCOORDS) @@ -763,13 +766,16 @@ ifdef(`PNETCDF',`dnl start[j] = 0; continue; } -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err != NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) -#endif + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + } + start[j] = var_shape[i][j]+1; /* should cause NC_EINVALCOORDS */ err = GetVars($1)(ncid, i, start, edge, stride, value); IF (err != NC_EINVALCOORDS) @@ -1063,13 +1069,16 @@ ifdef(`PNETCDF',`dnl start[j] = 0; continue; } -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err != NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) -#endif + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + } + start[j] = var_shape[i][j]+1; /* should cause NC_EINVALCOORDS */ err = GetVarm($1)(ncid, i, start, edge, stride, imap, value); IF (err != NC_EINVALCOORDS) diff --git a/test/nc_test/test_iget.m4 b/test/nc_test/test_iget.m4 index 2d07ace88c..d51c009cea 100644 --- a/test/nc_test/test_iget.m4 +++ b/test/nc_test/test_iget.m4 
@@ -627,14 +627,18 @@ ifdef(`PNETCDF',`dnl if (var_dimid[i][j] == RECDIM) continue; /* skip record dim */ start[j] = var_shape[i][j]; err = APIFunc(iget_vara)(ncid, i, start, edge, value, 0, datatype, &reqid); -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err != NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) -#endif - ELSE_NOK + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + ELSE_NOK + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + ELSE_NOK + } + start[j] = var_shape[i][j]+1; /* out of boundary check */ err = APIFunc(iget_vara)(ncid, i, start, edge, value, 1, datatype, &reqid); IF (err != NC_EINVALCOORDS) @@ -865,17 +869,21 @@ ifdef(`PNETCDF',`dnl start[j] = 0; continue; } -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err != NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) - err = APIFunc(wait_all)(ncid, 1, &reqid, &st); - assert(err == st); - IF (err != NC_NOERR) EXPECT_ERR(NC_NOERR, err) -#endif - ELSE_NOK + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + err = APIFunc(wait_all)(ncid, 1, &reqid, &st); + assert(err == st); + IF (err != NC_NOERR) EXPECT_ERR(NC_NOERR, err) + ELSE_NOK + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + ELSE_NOK + } + start[j] = var_shape[i][j]+1; /* out of boundary check */ err = iGetVara($1)(ncid, i, start, edge, value, &reqid); IF (err != NC_EINVALCOORDS) @@ -1108,14 +1116,18 @@ ifdef(`PNETCDF',`dnl if (var_dimid[i][j] == RECDIM) continue; /* skip record dim */ start[j] = var_shape[i][j]; err = APIFunc(iget_vars)(ncid, i, start, 
edge, stride, value, 0, datatype, &reqid); -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err != NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) -#endif - ELSE_NOK + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + ELSE_NOK + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + ELSE_NOK + } + start[j] = var_shape[i][j]+1; /* out of boundary check */ err = APIFunc(iget_vars)(ncid, i, start, edge, stride, value, 1, datatype, &reqid); IF (err != NC_EINVALCOORDS) @@ -1378,17 +1390,21 @@ ifdef(`PNETCDF',`dnl start[j] = 0; continue; } -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err != NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) - err = APIFunc(wait_all)(ncid, 1, &reqid, &st); - assert(err == st); - IF (err != NC_NOERR) EXPECT_ERR(NC_NOERR, err) -#endif - ELSE_NOK + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + err = APIFunc(wait_all)(ncid, 1, &reqid, &st); + assert(err == st); + IF (err != NC_NOERR) EXPECT_ERR(NC_NOERR, err) + ELSE_NOK + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + ELSE_NOK + } + start[j] = var_shape[i][j]+1; /* out of boundary check */ err = iGetVars($1)(ncid, i, start, edge, stride, value, &reqid); IF (err != NC_EINVALCOORDS) @@ -1643,14 +1659,18 @@ ifdef(`PNETCDF',`dnl if (var_dimid[i][j] == RECDIM) continue; /* skip record dim */ start[j] = var_shape[i][j]; err = APIFunc(iget_varm)(ncid, i, start, edge, stride, imap, value, 0, datatype, &reqid); -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - 
EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err != NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) -#endif - ELSE_NOK + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + ELSE_NOK + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + ELSE_NOK + } + start[j] = var_shape[i][j]+1; /* out of boundary check */ err = APIFunc(iget_varm)(ncid, i, start, edge, stride, imap, value, 1, datatype, &reqid); IF (err != NC_EINVALCOORDS) @@ -1920,17 +1940,21 @@ ifdef(`PNETCDF',`dnl start[j] = 0; continue; } -#ifndef RELAX_COORD_BOUND - IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ - EXPECT_ERR(NC_EINVALCOORDS, err) -#else - IF (err != NC_NOERR) /* allowed when edge[j]==0 */ - EXPECT_ERR(NC_NOERR, err) - err = APIFunc(wait_all)(ncid, 1, &reqid, &st); - assert(err == st); - IF (err != NC_NOERR) EXPECT_ERR(NC_NOERR, err) -#endif - ELSE_NOK + + if (is_relax_coord_bound()) { + IF (err != NC_NOERR) /* allowed when edge[j]==0 */ + EXPECT_ERR(NC_NOERR, err) + err = APIFunc(wait_all)(ncid, 1, &reqid, &st); + assert(err == st); + IF (err != NC_NOERR) EXPECT_ERR(NC_NOERR, err) + ELSE_NOK + } + else { + IF (err != NC_EINVALCOORDS) /* not allowed even when edge[j]==0 */ + EXPECT_ERR(NC_EINVALCOORDS, err) + ELSE_NOK + } + start[j] = var_shape[i][j]+1; /* out of boundary check */ err = iGetVarm($1)(ncid, i, start, edge, stride, imap, value, &reqid); IF (err != NC_EINVALCOORDS) diff --git a/test/nc_test/test_iput.m4 b/test/nc_test/test_iput.m4 index d9f0da58ce..7bd902eea9 100644 --- a/test/nc_test/test_iput.m4 +++ b/test/nc_test/test_iput.m4 @@ -76,12 +76,12 @@ define(`CheckRange3', #include "tests.h" static double -hash2nc(const nc_type var_type, int var_rank, MPI_Offset *index) +hash2nc(const nc_type xtype, int v_rank, MPI_Offset *index) { double min; double max; - switch (var_type) { + switch (xtype) { /* no type conversion 
will happen for NC_CHAR, use in-memory limits */ case NC_CHAR: min = CHAR_MIN; max = (double)CHAR_MAX; break; case NC_BYTE: min = X_BYTE_MIN; max = (double)X_BYTE_MAX; break; @@ -98,16 +98,16 @@ hash2nc(const nc_type var_type, int var_rank, MPI_Offset *index) return NC_EBADTYPE; } - return MAX(min, MIN(max, hash(var_type, var_rank, index))); + return MAX(min, MIN(max, hash(xtype, v_rank, index))); } static int -dbls2ncs(size_t nels, int var_type, double *inBuf, void *outBuf) +dbls2ncs(size_t nels, int xtype, double *inBuf, void *outBuf) { size_t i; char *p = (char*)outBuf; for (i=0; i] max. number of messages per test (Default: 20)') end - subroutine report_test + subroutine report_test(timing) implicit none character(LEN=1024) msg + double precision timing #include "tests.inc" if (cdf_format .EQ. 4) then @@ -63,7 +64,7 @@ subroutine report_test write(*,*) trim(PROGNAME)//' expects to see 0 failure ... '//& 'Total number of failures: ', nfailsTotal endif - call pass_fail(nfailsTotal, msg) + call pass_fail(nfailsTotal, msg, timing) end subroutine test(name, func) @@ -88,7 +89,7 @@ subroutine test(name, func) print *, ' ' print *, ' ### ', nfails, ' FAILURES TESTING ', name, & '! Stop ... ###' - call report_test + call report_test(0.0d0) stop 2 end if end @@ -427,7 +428,12 @@ program nf90_test external test_nf90mpi_set_default_format external nc_ignorefpe + double precision timing + call MPI_INIT(err) + + timing = MPI_Wtime() + comm = MPI_COMM_WORLD call nc_ignorefpe(1) @@ -865,7 +871,8 @@ program nf90_test call MPI_Info_free(info, err) - call report_test + timing = MPI_Wtime() - timing + call report_test(timing) ! if (nfailsTotal .eq. 
0) call ud_exit(0) call ud_exit(0) diff --git a/test/nf90_test/seq_runs.sh b/test/nf90_test/seq_runs.sh index ade7a6ad3a..c862fce875 100755 --- a/test/nf90_test/seq_runs.sh +++ b/test/nf90_test/seq_runs.sh @@ -36,7 +36,7 @@ rm -f ${OUTDIR}/tooth-fairy.nc ${TESTSEQRUN} ./nf90_test -5 -d ${TESTOUTDIR} ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/test.nc -if test "${ENABLE_NETCDF4}" = 1 ; then +if test "x${ENABLE_NETCDF4}" = x1 ; then rm -f ${OUTDIR}/test.nc rm -f ${OUTDIR}/scratch.nc rm -f ${OUTDIR}/tooth-fairy.nc diff --git a/test/nf90_test/test_get.m4 b/test/nf90_test/test_get.m4 index 75b8e36369..56f2b37ea7 100644 --- a/test/nf90_test/test_get.m4 +++ b/test/nf90_test/test_get.m4 @@ -335,7 +335,7 @@ define([TEST_NFMPI_GET_VARA],[dnl integer(kind=MPI_OFFSET_KIND) edge(MAX_RANK) integer(kind=MPI_OFFSET_KIND) index(MAX_RANK) integer(kind=MPI_OFFSET_KIND) mid(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, MAX_NELS) doubleprecision expect(MAX_NELS) doubleprecision val @@ -394,13 +394,14 @@ define([TEST_NFMPI_GET_VARA],[dnl if (err .ne. NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. 
NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVar(ncid, i, value, start, edge) @@ -545,7 +546,7 @@ define([TEST_NFMPI_GET_VARS],dnl integer(kind=MPI_OFFSET_KIND) count(MAX_RANK) integer(kind=MPI_OFFSET_KIND) sstride(MAX_RANK) integer(kind=MPI_OFFSET_KIND) stride(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, MAX_NELS) doubleprecision expect(MAX_NELS) doubleprecision val @@ -625,13 +626,14 @@ define([TEST_NFMPI_GET_VARS],dnl if (err .ne. NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVar(ncid, i, value, start, edge, stride) @@ -818,7 +820,7 @@ define([TEST_NFMPI_GET_VARM],dnl integer(kind=MPI_OFFSET_KIND) sstride(MAX_RANK) integer(kind=MPI_OFFSET_KIND) stride(MAX_RANK) integer(kind=MPI_OFFSET_KIND) imap(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, MAX_NELS) doubleprecision expect(MAX_NELS) doubleprecision val @@ -904,13 +906,14 @@ define([TEST_NFMPI_GET_VARM],dnl if (err .ne. NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. 
NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVar(ncid, i, value, start, edge, stride, imap) diff --git a/test/nf90_test/test_iget.m4 b/test/nf90_test/test_iget.m4 index fff6559272..5e2d4eedde 100644 --- a/test/nf90_test/test_iget.m4 +++ b/test/nf90_test/test_iget.m4 @@ -313,7 +313,7 @@ define([TEST_NFMPI_IGET_VARA],[dnl integer(kind=MPI_OFFSET_KIND) edge(MAX_RANK) integer(kind=MPI_OFFSET_KIND) index(MAX_RANK) integer(kind=MPI_OFFSET_KIND) mid(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -383,13 +383,14 @@ define([TEST_NFMPI_IGET_VARA],[dnl if (err .ne. NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVar(ncid, i, value,reqid(1), start, edge) @@ -542,7 +543,7 @@ define([TEST_NFMPI_IGET_VARS],dnl integer(kind=MPI_OFFSET_KIND) count(MAX_RANK) integer(kind=MPI_OFFSET_KIND) sstride(MAX_RANK) integer(kind=MPI_OFFSET_KIND) stride(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -623,13 +624,14 @@ define([TEST_NFMPI_IGET_VARS],dnl if (err .ne. NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. 
NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVar(ncid, i, value,reqid(1), start, edge, stride) @@ -813,7 +815,7 @@ define([TEST_NFMPI_IGET_VARM],dnl integer(kind=MPI_OFFSET_KIND) sstride(MAX_RANK) integer(kind=MPI_OFFSET_KIND) stride(MAX_RANK) integer(kind=MPI_OFFSET_KIND) imap(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -895,13 +897,14 @@ define([TEST_NFMPI_IGET_VARM],dnl if (err .ne. NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVar(ncid, i, value,reqid(1), start, edge, stride, imap) diff --git a/test/nf90_test/test_iput.m4 b/test/nf90_test/test_iput.m4 index c65dbde787..3143cfd7ad 100644 --- a/test/nf90_test/test_iput.m4 +++ b/test/nf90_test/test_iput.m4 @@ -486,6 +486,7 @@ define([TEST_NFMPI_IPUT_VARA],dnl integer(kind=MPI_OFFSET_KIND) index(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -577,13 +578,14 @@ define([TEST_NFMPI_IPUT_VARA],dnl if (err .ne. 
NF90_ECHAR) & call errore('conversion: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iPutVar(ncid, i, value,reqid(1), start, edge) @@ -723,6 +725,7 @@ define([TEST_NFMPI_IPUT_VARS],dnl integer(kind=MPI_OFFSET_KIND) stride(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -825,13 +828,14 @@ define([TEST_NFMPI_IPUT_VARS],dnl if (err .ne. NF90_ECHAR) & call errore('conversion: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iPutVar(ncid, i, value,reqid(1), start, edge, stride) @@ -1003,6 +1007,7 @@ define([TEST_NFMPI_IPUT_VARM],dnl integer(kind=MPI_OFFSET_KIND) imap(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -1106,13 +1111,14 @@ define([TEST_NFMPI_IPUT_VARM],dnl if (err .ne. NF90_ECHAR) & call errore('conversion: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. 
NF90_NOERR) & - call error(ErrFunc(err)) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call error(ErrFunc(err)) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iPutVar(ncid, i, value,reqid(1), start, edge, stride, imap) diff --git a/test/nf90_test/test_put.m4 b/test/nf90_test/test_put.m4 index 857934c013..5e6b5671a9 100644 --- a/test/nf90_test/test_put.m4 +++ b/test/nf90_test/test_put.m4 @@ -751,6 +751,7 @@ define([TEST_NFMPI_PUT_VARA],dnl integer(kind=MPI_OFFSET_KIND) index(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -859,13 +860,14 @@ define([TEST_NFMPI_PUT_VARA],dnl if (err .ne. NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call errore('PutVarAll($1): ',err) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call errore('PutVarAll($1): ',err) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif ! make start() way out of bounds start(j) = var_shape(j,i) + 2 @@ -995,6 +997,7 @@ define([TEST_NFMPI_PUT_VARS],dnl integer(kind=MPI_OFFSET_KIND) stride(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -1115,13 +1118,14 @@ define([TEST_NFMPI_PUT_VARS],dnl if (err .ne. 
NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call errore('PutVarAll($1): ',err) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call errore('PutVarAll($1): ',err) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif ! make start() way out of bounds start(j) = var_shape(j,i) + 2 @@ -1284,6 +1288,7 @@ define([TEST_NFMPI_PUT_VARM],dnl integer(kind=MPI_OFFSET_KIND) imap(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -1410,13 +1415,14 @@ define([TEST_NFMPI_PUT_VARM],dnl if (err .ne. NF90_ECHAR) & call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF90_NOERR) & - call errore('PutVarAll($1): ',err) -#else - if (err .ne. NF90_EINVALCOORDS) & - call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF90_NOERR) & + call errore('PutVarAll($1): ',err) + else + if (err .ne. NF90_EINVALCOORDS) & + call errore('bad start: ', err) + endif endif ! 
make start() way out of bounds start(j) = var_shape(j,i) + 2 diff --git a/test/nf_test/Makefile.am b/test/nf_test/Makefile.am index 9532b07185..1754e3288d 100644 --- a/test/nf_test/Makefile.am +++ b/test/nf_test/Makefile.am @@ -89,11 +89,24 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export ENABLE_NETCDF4="$(ENABLE_NETCDF4)"; + +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_NETCDF4 + TESTS_ENVIRONMENT += export ENABLE_NETCDF4=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif TESTS = seq_runs.sh TEST_EXTENSIONS = .sh @@ -104,9 +117,7 @@ nodist_nf_test_SOURCES = $(M4SRCS:.m4=.F) $(M4SRCS:.m4=.F): Makefile CLEANFILES = $(M4SRCS:.m4=.F) \ - $(TESTOUTDIR)/scratch.nc \ - $(TESTOUTDIR)/test.nc \ - $(TESTOUTDIR)/tooth-fairy.nc \ + $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ core core.* *.gcda *.gcno *.gcov gmon.out EXTRA_DIST = $(M4SRCS) $(HFILES) README seq_runs.sh diff --git a/test/nf_test/nf_test.F b/test/nf_test/nf_test.F index 45b344c52c..64ce73473c 100644 --- a/test/nf_test/nf_test.F +++ b/test/nf_test/nf_test.F @@ -37,9 +37,10 @@ subroutine usage() end - subroutine report_test + subroutine report_test(timing) implicit none character(LEN=1024) msg + double precision timing #include "tests.inc" integer MY_LEN_TRIM @@ -57,7 +58,7 @@ subroutine report_test + ' expects to see 0 
failure ... '// + 'Total number of failures: ', nfailsTotal endif - call pass_fail(nfailsTotal, msg) + call pass_fail(nfailsTotal, msg, timing) end subroutine test(name, func) @@ -422,7 +423,12 @@ program nf_test external test_nfmpi_set_default_format external nc_ignorefpe + double precision timing + call MPI_INIT(err) + + timing = MPI_Wtime() + comm = MPI_COMM_WORLD call nc_ignorefpe(1) @@ -865,7 +871,8 @@ program nf_test call MPI_Info_free(info, err) - call report_test + timing = MPI_Wtime() - timing + call report_test(timing) ! if (nfailsTotal .eq. 0) call ud_exit(0) call ud_exit(0) diff --git a/test/nf_test/seq_runs.sh b/test/nf_test/seq_runs.sh index f76a7a28c8..08c5e4bdc0 100755 --- a/test/nf_test/seq_runs.sh +++ b/test/nf_test/seq_runs.sh @@ -30,7 +30,7 @@ rm -f ${OUTDIR}/tooth-fairy.nc ${TESTSEQRUN} ./nf_test -2 -d ${TESTOUTDIR} ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/test.nc -if test "${ENABLE_NETCDF4}" = 1 ; then +if test "x${ENABLE_NETCDF4}" = x1 ; then rm -f ${OUTDIR}/test.nc rm -f ${OUTDIR}/scratch.nc rm -f ${OUTDIR}/tooth-fairy.nc diff --git a/test/nf_test/test_get.m4 b/test/nf_test/test_get.m4 index 7ae87007ab..acfc9ab399 100644 --- a/test/nf_test/test_get.m4 +++ b/test/nf_test/test_get.m4 @@ -320,7 +320,7 @@ define([TEST_NFMPI_GET_VARA],[dnl integer*8 edge(MAX_RANK) integer*8 index(MAX_RANK) integer*8 mid(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -386,13 +386,14 @@ C /* there is nothing to get (edge(j).eq.0) */ if (err .NE. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .NE. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .NE. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .NE. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .NE. 
NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVara($1)(ncid, i, @@ -548,7 +549,7 @@ define([TEST_NFMPI_GET_VARS],dnl integer*8 count(MAX_RANK) integer*8 sstride(MAX_RANK) integer*8 stride(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -636,13 +637,14 @@ C /* there is nothing to get (edge(j).eq.0) */ if (err .NE. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .NE. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .NE. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .NE. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .NE. NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVars($1)(ncid, i, @@ -832,7 +834,7 @@ define([TEST_NFMPI_GET_VARM],dnl integer*8 sstride(MAX_RANK) integer*8 stride(MAX_RANK) integer*8 imap(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -926,13 +928,14 @@ C /* there is nothing to get (edge(j).eq.0) */ if (err .NE. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .NE. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .NE. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .NE. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .NE. 
NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = GetVarm($1)(ncid, i, diff --git a/test/nf_test/test_iget.m4 b/test/nf_test/test_iget.m4 index d188c8d5fe..69c399d0e0 100644 --- a/test/nf_test/test_iget.m4 +++ b/test/nf_test/test_iget.m4 @@ -322,7 +322,7 @@ define([TEST_NFMPI_IGET_VARA],[dnl integer*8 edge(MAX_RANK) integer*8 index(MAX_RANK) integer*8 mid(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -399,13 +399,14 @@ C /* there is nothing to get (edge(j).eq.0) */ if (err .NE. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .ne. NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iGetVara($1)(ncid, i, @@ -559,7 +560,7 @@ define([TEST_NFMPI_IGET_VARS],dnl integer*8 count(MAX_RANK) integer*8 sstride(MAX_RANK) integer*8 stride(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -648,13 +649,14 @@ C /* there is nothing to get (edge(j).eq.0) */ if (err .NE. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .ne. 
NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iGetVars($1)(ncid, i, @@ -843,7 +845,7 @@ define([TEST_NFMPI_IGET_VARM],dnl integer*8 sstride(MAX_RANK) integer*8 stride(MAX_RANK) integer*8 imap(MAX_RANK) - logical canConvert + logical canConvert, relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision expect(MAX_NELS) doubleprecision val @@ -933,13 +935,14 @@ C /* there is nothing to get (edge(j).eq.0) */ if (err .NE. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .ne. NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iGetVarm($1)(ncid, i, diff --git a/test/nf_test/test_iput.m4 b/test/nf_test/test_iput.m4 index 3fa39f3bb6..d364cae4f2 100644 --- a/test/nf_test/test_iput.m4 +++ b/test/nf_test/test_iput.m4 @@ -501,6 +501,7 @@ define([TEST_NFMPI_IPUT_VARA],dnl integer*8 index(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -599,13 +600,14 @@ C /* Check correct error returned even when nothing to put */ if (err .ne. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .ne. 
NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iPutVara($1)(ncid, i, @@ -751,6 +753,7 @@ define([TEST_NFMPI_IPUT_VARS],dnl integer*8 stride(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -861,13 +864,14 @@ C /* Check correct error returned even when nothing to put */ if (err .ne. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .ne. NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iPutVars($1)(ncid, i, @@ -1045,6 +1049,7 @@ define([TEST_NFMPI_IPUT_VARM],dnl integer*8 imap(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -1156,13 +1161,14 @@ C /* Check correct error returned even when nothing to put */ if (err .ne. NF_ECHAR) + call errore('wrong type: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call error(ErrFunc(err)) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF_NOERR) + + call error(ErrFunc(err)) + else + if (err .ne. 
NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = iPutVarm($1)(ncid, i, diff --git a/test/nf_test/test_put.m4 b/test/nf_test/test_put.m4 index 7ea68ff073..0ca97e5f4f 100644 --- a/test/nf_test/test_put.m4 +++ b/test/nf_test/test_put.m4 @@ -729,6 +729,7 @@ define([TEST_NFMPI_PUT_VARA],dnl integer*8 index(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -843,13 +844,14 @@ C /* Check correct error returned even when nothing to put */ if (err .ne. NF_ECHAR) + call errore('conversion: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call errore('expect no error: ', err) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF_NOERR) + + call errore('expect no error: ', err) + else + if (err .ne. NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = PutVaraAll($1)(ncid, i, @@ -980,6 +982,7 @@ define([TEST_NFMPI_PUT_VARS],dnl integer*8 stride(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -1099,13 +1102,14 @@ C /* Check correct error returned even when nothing to put */ if (err .ne. NF_ECHAR) + call errore('conversion: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call errore('expect no error: ', err) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. 
NF_NOERR) + + call errore('expect no error: ', err) + else + if (err .ne. NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = PutVarsAll($1)(ncid, i, @@ -1269,6 +1273,7 @@ define([TEST_NFMPI_PUT_VARM],dnl integer*8 imap(MAX_RANK) logical canConvert !/* Both text or both numeric */ logical allInExtRange !/* all values within external range? */ + logical relax_coord_bound, relax_coord_bound_f DATATYPE($1, value, (MAX_NELS)) doubleprecision val integer ud_shift @@ -1388,13 +1393,14 @@ C /* Check correct error returned even when nothing to put */ if (err .ne. NF_ECHAR) + call errore('conversion: ', err) else -#ifdef RELAX_COORD_BOUND - if (err .ne. NF_NOERR) - + call errore('expect no error: ', err) -#else - if (err .ne. NF_EINVALCOORDS) - + call errore('bad start: ', err) -#endif + relax_coord_bound = relax_coord_bound_f() + if (relax_coord_bound) then + if (err .ne. NF_NOERR) + + call errore('expect no error: ', err) + else + if (err .ne. 
NF_EINVALCOORDS) + + call errore('bad start: ', err) + endif endif start(j) = var_shape(j,i) + 2 err = PutVarmAll($1)(ncid, i, diff --git a/test/nonblocking/Makefile.am b/test/nonblocking/Makefile.am index 568154932d..1d76fbcb7e 100644 --- a/test/nonblocking/Makefile.am +++ b/test/nonblocking/Makefile.am @@ -50,19 +50,19 @@ if DECL_MPI_OFFSET AM_FCFLAGS += $(FC_DEFINE)HAVE_DECL_MPI_OFFSET endif -TESTPROGRAMS = test_bput \ - interleaved \ - i_varn_int64 \ - flexible_bput \ - wait_after_indep \ - req_all \ - i_varn_indef \ - large_num_reqs - -M4_SRCS = bput_varn.m4 \ - column_wise.m4 - -TESTPROGRAMS += $(M4_SRCS:.m4=) +M4_SRCS = bput_varn.m4 \ + column_wise.m4 + +check_PROGRAMS = test_bput \ + interleaved \ + i_varn_int64 \ + flexible_bput \ + wait_after_indep \ + req_all \ + i_varn_indef \ + large_num_reqs \ + mcoll_perf \ + $(M4_SRCS:.m4=) $(M4_SRCS:.m4=.c): Makefile @@ -80,24 +80,22 @@ nodist_bput_varn_SOURCES = bput_varn.c nodist_column_wise_SOURCES = column_wise.c if HAS_FORTRAN - TESTPROGRAMS += mcoll_testf77 \ - test_bputf77 + check_PROGRAMS += mcoll_testf77 \ + test_bputf77 mcoll_testf77_SOURCES = mcoll_testf77.f test_bputf77_SOURCES = test_bputf77.f if HAVE_MPI_MOD - TESTPROGRAMS += mcoll_testf \ - test_bputf + check_PROGRAMS += mcoll_testf \ + test_bputf mcoll_testf_SOURCES = mcoll_testf.f90 test_bputf_SOURCES = test_bputf.f90 endif endif -check_PROGRAMS = $(TESTPROGRAMS) mcoll_perf - # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead -# AM_TESTS_ENVIRONMENT = TESTPROGRAMS="$(TESTPROGRAMS)" ; export TESTPROGRAMS; +# AM_TESTS_ENVIRONMENT = check_PROGRAMS="$(check_PROGRAMS)" ; export check_PROGRAMS; # AM_TESTS_ENVIRONMENT += TESTSEQRUN="$(TESTSEQRUN)" ; export TESTSEQRUN; # AM_TESTS_ENVIRONMENT += TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)" ; export TESTOUTDIR; TESTS_ENVIRONMENT = export SED="$(SED)"; @@ -105,22 +103,29 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT 
+= export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS = $(TESTPROGRAMS) seq_runs.sh +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + +TESTS = $(check_PROGRAMS) TEST_EXTENSIONS = .sh -LOG_COMPILER = $(srcdir)/wrap_runs.sh +LOG_COMPILER = $(srcdir)/seq_runs.sh SH_LOG_COMPILER = -NC_FILES = $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.nc) \ - $(TESTPROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) - -CLEANFILES = $(M4_SRCS:.m4=.c) core core.* *.gcda *.gcno *.gcov gmon.out \ - $(TESTOUTDIR)/testfile*.nc $(NC_FILES) \ - $(TESTOUTDIR)/mcoll_perf.nc.* $(TESTOUTDIR)/mcoll_perf.bb.nc.* +CLEANFILES = $(M4_SRCS:.m4=.c) \ + $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ + core core.* *.gcda *.gcno *.gcov gmon.out EXTRA_DIST = $(M4_SRCS) seq_runs.sh wrap_runs.sh parallel_run.sh diff --git a/test/nonblocking/bput_varn.m4 b/test/nonblocking/bput_varn.m4 index 8cc4a9541c..91a8bb563e 100644 --- a/test/nonblocking/bput_varn.m4 +++ b/test/nonblocking/bput_varn.m4 @@ -35,28 +35,28 @@ dnl * data: * * var0 = - * 13, 13, 13, 11, 11, 10, 10, 12, 11, 11, - * 10, 12, 12, 12, 13, 11, 11, 12, 12, 12, - * 11, 11, 12, 13, 13, 13, 10, 10, 11, 11, - * 10, 10, 10, 12, 11, 11, 11, 13, 13, 13 ; + * 3, 3, 3, 1, 1, 0, 0, 2, 1, 1, + * 0, 2, 2, 2, 3, 1, 1, 2, 2, 2, + * 1, 1, 2, 3, 3, 3, 0, 0, 1, 1, + * 0, 0, 0, 2, 1, 1, 1, 3, 3, 3 ; * var1 = - * 12, 12, 12, 10, 10, 13, 13, 11, 10, 10, - * 13, 11, 11, 11, 12, 
10, 10, 11, 11, 11, - * 10, 10, 11, 12, 12, 12, 13, 13, 10, 10, - * 13, 13, 13, 11, 10, 10, 10, 12, 12, 12 ; + * 2, 2, 2, 0, 0, 3, 3, 1, 0, 0, + * 3, 1, 1, 1, 2, 0, 0, 1, 1, 1, + * 0, 0, 1, 2, 2, 2, 3, 3, 0, 0, + * 3, 3, 3, 1, 0, 0, 0, 2, 2, 2 ; * * var2 = - * 11, 11, 11, 13, 13, 12, 12, 10, 13, 13, - * 12, 10, 10, 10, 11, 13, 13, 10, 10, 10, - * 13, 13, 10, 11, 11, 11, 12, 12, 13, 13, - * 12, 12, 12, 10, 13, 13, 13, 11, 11, 11 ; + * 1, 1, 1, 3, 3, 2, 2, 0, 3, 3, + * 2, 0, 0, 0, 1, 3, 3, 0, 0, 0, + * 3, 3, 0, 1, 1, 1, 2, 2, 3, 3, + * 2, 2, 2, 0, 3, 3, 3, 1, 1, 1 ; * * var3 = - * 10, 10, 10, 12, 12, 11, 11, 13, 12, 12, - * 11, 13, 13, 13, 10, 12, 12, 13, 13, 13, - * 12, 12, 13, 10, 10, 10, 11, 11, 12, 12, - * 11, 11, 11, 13, 12, 12, 12, 10, 10, 10 ; + * 0, 0, 0, 2, 2, 1, 1, 3, 2, 2, + * 1, 3, 3, 3, 0, 2, 2, 3, 3, 3, + * 2, 2, 3, 0, 0, 0, 1, 1, 2, 2, + * 1, 1, 1, 3, 2, 2, 2, 0, 0, 0 ; * } * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -68,6 +68,8 @@ dnl #include #include /* strcpy() */ #include /* basename() */ +#include /* getopt() */ + #include #include @@ -76,6 +78,7 @@ dnl #define NLOOPS 4 #define MAX_NREQS 6 #define NDIMS 2 +#define NVARS 4 #define NY 4 #define NX 10 @@ -92,6 +95,7 @@ include(`utils.m4')dnl printf("Error at line %d in %s: err[%d] %s\n", __LINE__, __FILE__, _i, \ ncmpi_strerrno((a)[_i])); \ nerrs++; \ + goto err_out; \ } \ } \ } @@ -162,70 +166,86 @@ void permute(MPI_Offset *a, MPI_Offset *b) define(`TEST_BPUT_VARN',`dnl static -int clear_file_contents_$1(int ncid, int *varid) +int clear_file_contents_$1(int ncid, int *varid, int coll_io) { - int i, err, nerrs=0, rank; + int i, err, nerrs=0; $1 *w_buffer = ($1*) malloc(sizeof($1) * NY*NX); - for (i=0; i 4) MPI_Barrier(MPI_COMM_WORLD); - for (i=0; i<4; i++) { - for (j=0; j= 10+nprocs) continue; - if (r_buffer[j] != expected[i][j]) { - printf("Expected read $1 buf[%d][%d]=%d, but got %d\n", - i,j,(int)expected[i][j],(int)r_buffer[j]); + $1 exp = (expected[i][j] >= 
nprocs) ? fillv : expected[i][j]; + if (r_buffer[j] != exp) { + char var_name[16]; + ncmpi_inq_varname(ncid, varid[i], var_name); + printf("Expected var %s read $1 buf[%d][%d]=%d, but got %d\n", + var_name,i,j,(int)exp,(int)r_buffer[j]); nerrs++; - goto fn_exit; + break; } } + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + if (nerrs > 0) goto fn_exit; } fn_exit: free(r_buffer); @@ -233,10 +253,10 @@ fn_exit: } static int -test_bput_varn_$1(char *filename, int cdf) +test_bput_varn_$1(const char *out_path, int format, int coll_io, MPI_Info info) { - int i, j, k, rank, err, nerrs=0, bb_enabled; - int ncid, cmode, varid[NLOOPS], dimid[2], nreqs, reqs[NLOOPS], sts[NLOOPS]; + int i, j, k, rank, err, nerrs=0, bb_enabled, fmt; + int ncid, varid[NLOOPS], dimid[2], nreqs, reqs[NLOOPS], sts[NLOOPS]; int req_lens[NLOOPS], my_nsegs[NLOOPS], num_segs[NLOOPS] = {4, 6, 5, 4}; $1 *buffer[NLOOPS]; @@ -276,13 +296,12 @@ test_bput_varn_$1(char *filename, int cdf) MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER; - if (cdf == NC_FORMAT_CDF2) - cmode |= NC_64BIT_OFFSET; - else if (cdf == NC_FORMAT_CDF5) - cmode |= NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR { @@ -299,15 +318,6 @@ test_bput_varn_$1(char *filename, int cdf) MPI_Info_free(&infoused); } - /* create a global array of size NY * NX */ - err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); CHECK_ERR - err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); CHECK_ERR - err = ncmpi_def_var(ncid, "var0", NC_TYPE($1), NDIMS, dimid, &varid[0]); CHECK_ERR - err = ncmpi_def_var(ncid, "var1", NC_TYPE($1), NDIMS, dimid, &varid[1]); CHECK_ERR - err = ncmpi_def_var(ncid, "var2", NC_TYPE($1), NDIMS, dimid, &varid[2]); CHECK_ERR 
- err = ncmpi_def_var(ncid, "var3", NC_TYPE($1), NDIMS, dimid, &varid[3]); CHECK_ERR - err = ncmpi_enddef(ncid); CHECK_ERR - /* allocate space for starts and counts */ starts[0] = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * 4 * 6); counts[0] = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * 4 * 6); @@ -338,17 +348,32 @@ test_bput_varn_$1(char *filename, int cdf) } } + /* only rank 0, 1, 2, and 3 do I/O: + * each of ranks 0 to 3 write nreqs nonblocking requests */ + nreqs = 4; + if (rank >= 4) nreqs = 0; + + /* create a global array of size NY * NX */ + err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); CHECK_ERR + err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); CHECK_ERR + for (i=0; i= 4) nreqs = 0; - /* bufsize must be max of data type converted before and after */ MPI_Offset bufsize = 0; @@ -364,7 +389,7 @@ test_bput_varn_$1(char *filename, int cdf) /* allocate I/O buffer and initialize its contents */ buffer[i] = ($1*) malloc(sizeof($1) * req_lens[i]); - for (j=0; j 0) goto err_out; /* write using varn API, one bput call per variable */ for (i=0; i 0) goto err_out; + nerrs += check_num_pending_reqs(ncid, nreqs, __LINE__); - if (!bb_enabled) + if (nerrs > 0) goto err_out; + if (!bb_enabled) { /* burst buffering driver does not use attached memory */ nerrs += check_attached_buffer_usage(ncid, bufsize, bufsize, __LINE__); + if (nerrs > 0) goto err_out; + } - err = ncmpi_wait_all(ncid, nreqs, reqs, sts); + if (coll_io) + err = ncmpi_wait_all(ncid, nreqs, reqs, sts); + else + err = ncmpi_wait(ncid, nreqs, reqs, sts); ERRS(nreqs, sts) - if (!bb_enabled) + if (!coll_io) { + /* When in independent I/O mode, some processes may read before other + * write completion from previous writes. 
+ */ + err = ncmpi_flush(ncid); + if (err != NC_NOERR) goto err_out; + MPI_Barrier(MPI_COMM_WORLD); + } + + if (!bb_enabled) { /* now usgae of attached memory should be 0 */ nerrs += check_attached_buffer_usage(ncid, bufsize, 0, __LINE__); + if (nerrs > 0) goto err_out; + } /* all processes read entire variables back and check contents */ - nerrs += check_contents_for_fail_$1(ncid, varid); + nerrs += check_contents_for_fail_$1(ncid, varid, coll_io); + if (nerrs > 0) goto err_out; /* permute write order: so starts[*] are not in an increasing order: * swap segment 0 with segment 2 and swap segment 1 with segment 3 @@ -422,88 +472,140 @@ test_bput_varn_$1(char *filename, int cdf) } /* write using varn API, one bput call per variable */ - nerrs += clear_file_contents_$1(ncid, varid); + nerrs += clear_file_contents_$1(ncid, varid, coll_io); + if (nerrs > 0) goto err_out; for (i=0; i 0) goto err_out; + nerrs += check_num_pending_reqs(ncid, nreqs, __LINE__); - if (!bb_enabled) + if (nerrs > 0) goto err_out; + if (!bb_enabled) { /* burst buffering driver does not use attached memory */ nerrs += check_attached_buffer_usage(ncid, bufsize, bufsize, __LINE__); + if (nerrs > 0) goto err_out; + } - err = ncmpi_wait_all(ncid, nreqs, reqs, sts); + if (coll_io) + err = ncmpi_wait_all(ncid, nreqs, reqs, sts); + else + err = ncmpi_wait(ncid, nreqs, reqs, sts); ERRS(nreqs, sts) - if (!bb_enabled) + if (!coll_io) { + /* When in independent I/O mode, some processes may read before other + * write completion from previous writes. 
+ */ + err = ncmpi_flush(ncid); + if (err != NC_NOERR) goto err_out; + MPI_Barrier(MPI_COMM_WORLD); + } + + if (!bb_enabled) { /* now usgae of attached memory should be 0 */ nerrs += check_attached_buffer_usage(ncid, bufsize, 0, __LINE__); + if (nerrs > 0) goto err_out; + } /* all processes read entire variables back and check contents */ - nerrs += check_contents_for_fail_$1(ncid, varid); + nerrs += check_contents_for_fail_$1(ncid, varid, coll_io); + if (nerrs > 0) goto err_out; for (i=0; i 0) goto err_out; + nerrs += check_num_pending_reqs(ncid, nreqs, __LINE__); - if (!bb_enabled) + if (!bb_enabled) { /* burst buffering driver does not use attached memory */ nerrs += check_attached_buffer_usage(ncid, bufsize, bufsize, __LINE__); + if (nerrs > 0) goto err_out; + } - err = ncmpi_wait_all(ncid, nreqs, reqs, sts); + if (coll_io) + err = ncmpi_wait_all(ncid, nreqs, reqs, sts); + else + err = ncmpi_wait(ncid, nreqs, reqs, sts); ERRS(nreqs, sts) + if (!coll_io) { + /* When in independent I/O mode, some processes may read before other + * write completion from previous writes. 
+ */ + err = ncmpi_flush(ncid); + if (err != NC_NOERR) goto err_out; + MPI_Barrier(MPI_COMM_WORLD); + } + /* check if write buffer contents have been altered */ for (i=0; i 0) goto err_out; - if (!bb_enabled) + if (!bb_enabled) { /* now usgae of attached memory should be 0 */ nerrs += check_attached_buffer_usage(ncid, bufsize, 0, __LINE__); + if (nerrs > 0) goto err_out; + } /* all processes read entire variables back and check contents */ - nerrs += check_contents_for_fail_$1(ncid, varid); + nerrs += check_contents_for_fail_$1(ncid, varid, coll_io); + if (nerrs > 0) goto err_out; /* permute back to original order */ for (i=0; i 0) goto err_out; for (i=0; i 0) goto err_out; + nerrs += check_num_pending_reqs(ncid, nreqs, __LINE__); - if (!bb_enabled) + if (!bb_enabled) { /* burst buffering driver does not use attached memory */ nerrs += check_attached_buffer_usage(ncid, bufsize, bufsize, __LINE__); + if (nerrs > 0) goto err_out; + } - err = ncmpi_wait_all(ncid, nreqs, reqs, sts); + if (coll_io) + err = ncmpi_wait_all(ncid, nreqs, reqs, sts); + else + err = ncmpi_wait(ncid, nreqs, reqs, sts); ERRS(nreqs, sts) + if (!coll_io) { + /* When in independent I/O mode, some processes may read before other + * write completion from previous writes. 
+ */ + err = ncmpi_flush(ncid); + if (err != NC_NOERR) goto err_out; + MPI_Barrier(MPI_COMM_WORLD); + } + /* check if write buffer contents have been altered */ for (i=0; i 0) goto err_out; - if (!bb_enabled) + if (!bb_enabled) { /* now usgae of attached memory should be 0 */ nerrs += check_attached_buffer_usage(ncid, bufsize, 0, __LINE__); + if (nerrs > 0) goto err_out; + } /* all processes read entire variables back and check contents */ - nerrs += check_contents_for_fail_$1(ncid, varid); + nerrs += check_contents_for_fail_$1(ncid, varid, coll_io); +err_out: /* free the buffer space for bput */ if (bufsize > 0) { err = ncmpi_buffer_detach(ncid); CHECK_ERR @@ -570,6 +700,9 @@ test_bput_varn_$1(char *filename, int cdf) err = ncmpi_inq_buffer_usage(ncid, &bufsize); EXP_ERR(NC_ENULLABUF) + err = ncmpi_inq_format(ncid, &fmt); + assert(format == fmt); + err = ncmpi_close(ncid); CHECK_ERR for (i=0; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for bput_varn ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } - - for (i=0; i<3; i++) { - nerrs += test_bput_varn_text(filename, cdf_formats[i]); - nerrs += test_bput_varn_schar(filename, cdf_formats[i]); - nerrs += test_bput_varn_short(filename, cdf_formats[i]); - nerrs += test_bput_varn_int(filename, cdf_formats[i]); - nerrs += test_bput_varn_float(filename, cdf_formats[i]); - nerrs += test_bput_varn_double(filename, cdf_formats[i]); - if (cdf_formats[i] == NC_FORMAT_CDF5) { - nerrs += test_bput_varn_uchar(filename, cdf_formats[i]); - nerrs += test_bput_varn_ushort(filename, cdf_formats[i]); - nerrs += test_bput_varn_uint(filename, cdf_formats[i]); - nerrs += 
test_bput_varn_longlong(filename, cdf_formats[i]); - nerrs += test_bput_varn_ulonglong(filename, cdf_formats[i]); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int err; + + err = TEST_DATA_TYPE(text); if (err > 0) return err; + err = TEST_DATA_TYPE(schar); if (err > 0) return err; + err = TEST_DATA_TYPE(short); if (err > 0) return err; + err = TEST_DATA_TYPE(int); if (err > 0) return err; + err = TEST_DATA_TYPE(float); if (err > 0) return err; + err = TEST_DATA_TYPE(double); if (err > 0) return err; + if (format == NC_FORMAT_64BIT_DATA) { + err = TEST_DATA_TYPE(uchar); if (err > 0) return err; + err = TEST_DATA_TYPE(ushort); if (err > 0) return err; + err = TEST_DATA_TYPE(uint); if (err > 0) return err; + err = TEST_DATA_TYPE(longlong); if (err > 0) return err; + err = TEST_DATA_TYPE(ulonglong); if (err > 0) return err; } - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return 0; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test 
burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "bput_varn", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/column_wise.m4 b/test/nonblocking/column_wise.m4 index 9bc15958c6..e202e252f6 100644 --- a/test/nonblocking/column_wise.m4 +++ b/test/nonblocking/column_wise.m4 @@ -25,7 +25,7 @@ * * % m4 column_wise.m4 > column_wise.c * % mpicc -O2 -o column_wise column_wise.c -lpnetcdf - * % mpiexec -l -n 4 ./column_wise /pvfs2/wkliao/testfile.nc + * % mpiexec -l -n 4 ./column_wise -l 4 /pvfs2/wkliao/testfile.nc * 0: 0: myOff= 0 myNX= 4 * 1: 1: myOff= 4 myNX= 4 * 2: 2: myOff= 8 myNX= 4 @@ -67,37 +67,40 @@ #include #include /* strcpy() */ #include /* basename() */ +#include /* getopt() */ + #include #include #include #define NY 10 -#define NX 4 +#define NX 70 typedef char text; include(`foreach.m4')dnl include(`utils.m4')dnl +define(`TEST_DATA_TYPE',`test_column_wise_$1(out_path, format, coll_io, info)') + define(`TEST_COLUMN_WISE',`dnl static -int test_column_wise_$1(char *filename, int cdf) +int test_column_wise_$1(const char *out_path, int format, int coll_io, MPI_Info info) { int i, j, nerrs=0, rank, nprocs, err, myNX, G_NX, myOff, num_reqs; - int ncid, cmode, varid, dimid[2], *reqs, *sts; + int fmt, ncid, varid, dimid[2], *reqs, *sts; $1 **buf; MPI_Offset start[2], count[2]; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - cmode = NC_CLOBBER; - if (cdf == NC_FORMAT_CDF2) - cmode |= NC_64BIT_OFFSET; - else if (cdf == NC_FORMAT_CDF5) - cmode |= NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* the global 
array is NY * (NX * nprocs) */ @@ -124,6 +127,11 @@ int test_column_wise_$1(char *filename, int cdf) */ err = ncmpi_flush(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* initialize the buffer with rank ID. Also make the case interesting, by allocatsing buffersd separately */ for (i=0; i 0) return err; + err = TEST_DATA_TYPE(schar); if (err > 0) return err; + err = TEST_DATA_TYPE(short); if (err > 0) return err; + err = TEST_DATA_TYPE(int); if (err > 0) return err; + err = TEST_DATA_TYPE(float); if (err > 0) return err; + err = TEST_DATA_TYPE(double); if (err > 0) return err; + if (format == NC_FORMAT_CDF5) { + err = TEST_DATA_TYPE(uchar); if (err > 0) return err; + err = TEST_DATA_TYPE(ushort); if (err > 0) return err; + err = TEST_DATA_TYPE(uint); if (err > 0) return err; + err = TEST_DATA_TYPE(longlong); if (err > 0) return err; + err = TEST_DATA_TYPE(ulonglong); if (err > 0) return err; + } - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); + return 0; +} - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for iput/iget interleaved access ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } +int main(int argc, char **argv) { - for (i=0; i<3; i++) { - nerrs += test_column_wise_text(filename, cdf_formats[i]); - if (nerrs > 0) break; - nerrs += test_column_wise_schar(filename, cdf_formats[i]); - if (nerrs > 0) break; - nerrs += test_column_wise_short(filename, cdf_formats[i]); - if (nerrs > 0) break; - nerrs += test_column_wise_int(filename, cdf_formats[i]); - if (nerrs > 0) break; - nerrs += test_column_wise_float(filename, cdf_formats[i]); - if (nerrs > 0) 
break; - nerrs += test_column_wise_double(filename, cdf_formats[i]); - if (nerrs > 0) break; - if (cdf_formats[i] == NC_FORMAT_CDF5) { - nerrs += test_column_wise_uchar(filename, cdf_formats[i]); - if (nerrs > 0) break; - nerrs += test_column_wise_ushort(filename, cdf_formats[i]); - if (nerrs > 0) break; - nerrs += test_column_wise_uint(filename, cdf_formats[i]); - if (nerrs > 0) break; - nerrs += test_column_wise_longlong(filename, cdf_formats[i]); - if (nerrs > 0) break; - nerrs += test_column_wise_ulonglong(filename, cdf_formats[i]); - if (nerrs > 0) break; - } - } + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + MPI_Init(&argc, &argv); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "iput/iget interleaved access", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/flexible_bput.c b/test/nonblocking/flexible_bput.c 
index 468af74b6c..efe45aed2a 100644 --- a/test/nonblocking/flexible_bput.c +++ b/test/nonblocking/flexible_bput.c @@ -21,7 +21,7 @@ * * % mpicc -O2 -o flexible_bput flexible_bput.c -lpnetcdf * - * % mpiexec -l -n 4 ./flexible_bput /pvfs2/wkliao/testfile.nc + * % mpiexec -l -n 4 ./flexible_bput -l 4 /pvfs2/wkliao/testfile.nc * * % ncmpidump /pvfs2/wkliao/testfile.nc * netcdf testfile { @@ -48,14 +48,16 @@ #include #include /* strcpy() */ #include /* basename() */ +#include /* getopt() */ #include + #include #include #include #define NY 6 -#define NX 4 +#define NX 70 #define GHOST 2 #define INIT_PUT_BUF(buf) \ @@ -78,6 +80,7 @@ printf("Error at line %d in %s: put buffer altered buffer[%d][%d]=%f\n", \ __LINE__,__FILE__,i,j,(double)buf[i][j]); \ nerrs++; \ + goto err_out; \ } \ } \ else { \ @@ -85,6 +88,7 @@ printf("Error at line %d in %s: put buffer altered buffer[%d][%d]=%f\n", \ __LINE__,__FILE__,i,j,(double)buf[i][j]); \ nerrs++; \ + goto err_out; \ } \ } \ } \ @@ -101,55 +105,58 @@ if (i < GHOST || GHOST+array_of_subsizes[0] <= i || \ j < GHOST || GHOST+array_of_subsizes[1] <= j) { \ if (buf[i][j] != -2) { \ - printf("Error at line %d in %s: unexpected get buffer[%d][%d]=%f\n", \ - __LINE__,__FILE__,i,j,(double)buf[i][j]); \ + printf("Error at line %d in %s: expect buffer[%d][%d] to be %d but got %f\n", \ + __LINE__,__FILE__,i,j,-2,(double)buf[i][j]); \ nerrs++; \ + goto err_out; \ } \ } \ else { \ - if (buf[i][j] != (i-GHOST)*array_of_subsizes[1]+(j-GHOST)) { \ - printf("Error at line %d in %s: unexpected get buffer[%d][%d]=%f\n", \ - __LINE__,__FILE__,i,j,(double)buf[i][j]); \ + int exp = (i-GHOST)*array_of_subsizes[1]+(j-GHOST); \ + if (buf[i][j] != exp) { \ + printf("Error at line %d in %s: expect buffer[%d][%d] to be %d but got %f\n", \ + __LINE__,__FILE__,i,j,exp,(double)buf[i][j]); \ nerrs++; \ + goto err_out; \ } \ } \ } \ } -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int 
coll_io, + MPI_Info info) { - char filename[256]; int i, j, rank, nprocs, err, nerrs=0, req, status; - int ncid, cmode, varid, dimid[2]; + int ncid, varid, dimid[2]; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; - int buf_int[NX+2*GHOST][NY+2*GHOST]; - double buf_dbl[NX+2*GHOST][NY+2*GHOST]; + int **buf_int; + double **buf_dbl; MPI_Offset start[2], count[2], stride[2], imap[2]; MPI_Datatype subarray; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for flexible bput_varm ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } + buf_int = (int**) malloc(sizeof(int*) * (NX+2*GHOST)); + buf_int[0] = (int*) malloc(sizeof(int) * (NX+2*GHOST) * (NY+2*GHOST)); + for (i=1; i<(NX+2*GHOST); i++) + buf_int[i] = buf_int[i-1] + (NY+2*GHOST); + + buf_dbl = (double**) malloc(sizeof(double*) * (NX+2*GHOST)); + buf_dbl[0] = (double*) malloc(sizeof(double) * (NX+2*GHOST) * (NY+2*GHOST)); + for (i=1; i<(NX+2*GHOST); i++) + buf_dbl[i] = buf_dbl[i-1] + (NY+2*GHOST); + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define 2 dimensions */ @@ -160,6 +167,11 @@ int main(int argc, char** argv) err = ncmpi_def_var(ncid, "var", NC_DOUBLE, 2, dimid, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { 
+ err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + start[0] = 0; start[1] = NX * rank; count[0] = NY; count[1] = NX; stride[0] = 1; stride[1] = 1; @@ -184,13 +196,18 @@ int main(int argc, char** argv) for (i=0; i<2; i++) bufsize *= count[i]; err = ncmpi_buffer_attach(ncid, bufsize); CHECK_ERR - err = ncmpi_bput_varm(ncid, varid, start, count, stride, imap, buf_int, + err = ncmpi_bput_varm(ncid, varid, start, count, stride, imap, buf_int[0], 1, subarray, &req); CHECK_ERR /* check if the contents of put buffer are altered */ CHECK_PUT_BUF(buf_int) - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, 1, &req, &status); + else + err = ncmpi_wait(ncid, 1, &req, &status); + + CHECK_ERR err = status; CHECK_ERR /* check the contents of put buffer are altered */ @@ -198,15 +215,29 @@ int main(int argc, char** argv) err = ncmpi_buffer_detach(ncid); CHECK_ERR + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + /* read back using a blocking get_varm flexible API ---------------------*/ /* initiate get buffer contents */ INIT_GET_BUF(buf_int) - /* calling a blocking flexible API */ - err = ncmpi_get_varm_all(ncid, varid, start, count, stride, imap, buf_int, - 1, subarray); + if (!coll_io) { + /* calling a blocking flexible API */ + err = ncmpi_end_indep_data(ncid); + CHECK_ERR + } + err = ncmpi_get_varm_all(ncid, varid, start, count, stride, imap, + buf_int[0], 1, subarray); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* check the contents of get buffer */ CHECK_GET_BUF(buf_int) @@ -224,14 +255,18 @@ int main(int argc, char** argv) err = ncmpi_buffer_attach(ncid, bufsize); CHECK_ERR - err = ncmpi_bput_varm(ncid, varid, start, count, stride, imap, buf_dbl, + err = ncmpi_bput_varm(ncid, varid, start, count, stride, imap, buf_dbl[0], 1, subarray, &req); CHECK_ERR /* check the contents of put buffer are altered */ CHECK_PUT_BUF(buf_dbl) - 
err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, 1, &req, &status); + else + err = ncmpi_wait(ncid, 1, &req, &status); + CHECK_ERR err = status; CHECK_ERR /* check the contents of put buffer are altered */ @@ -244,36 +279,52 @@ int main(int argc, char** argv) INIT_GET_BUF(buf_dbl) /* calling a blocking flexible API */ - err = ncmpi_get_varm_all(ncid, varid, start, count, stride, imap, buf_dbl, - 1, subarray); + if (coll_io) + err = ncmpi_get_varm_all(ncid, varid, start, count, stride, imap, + buf_dbl[0], 1, subarray); + else + err = ncmpi_get_varm(ncid, varid, start, count, stride, imap, + buf_dbl[0], 1, subarray); CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(buf_dbl) +err_out: MPI_Type_free(&subarray); err = ncmpi_close(ncid); CHECK_ERR - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + free(buf_int[0]); + free(buf_int); + free(buf_dbl[0]); + free(buf_dbl); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature 
*/ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "flexible bput_varm", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/i_varn_indef.c b/test/nonblocking/i_varn_indef.c index 9617d99ad5..25405f7c63 100644 --- a/test/nonblocking/i_varn_indef.c +++ b/test/nonblocking/i_varn_indef.c @@ -8,10 +8,10 @@ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * This example tests posting nonblocking varn APIs, including - * ncmpi_iput_varn_longlong(), ncmpi_iget_varn_longlong(), ncmpi_iput_varn(), + * ncmpi_iput_varn_int(), ncmpi_iget_varn_int(), ncmpi_iput_varn(), * and ncmpi_iget_varn(), in define mode. * It first writes a sequence of requests with arbitrary array indices and - * lengths to four variables of type NC_INT64, and reads back. + * lengths to four variables of type NC_INT, and reads back. * * The compile and run commands are given below, together with an ncmpidump of * the output file. 
@@ -82,59 +82,48 @@ } static -int clear_file_contents(int ncid, int *varid) -{ - int i, err, nerrs=0, rank; - long long *w_buffer = (long long*) malloc(sizeof(long long) * NY*NX); - for (i=0; i 4) MPI_Barrier(MPI_COMM_WORLD); for (i=0; i<4; i++) { for (j=0; j= nprocs) continue; if (r_buffer[j] != expected[i][j]) { - printf("Error at line %d in %s: Expected read buf[%d][%d]=%lld, but got %lld\n", + printf("Error at line %d in %s: Expected read buf[%d][%d]=%d, but got %d\n", lineno,__FILE__,i,j,expected[i][j],r_buffer[j]); nerrs++; } @@ -170,15 +159,19 @@ void permute(MPI_Offset *a, MPI_Offset *b) } } -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256], *varname[4]; + char *varname[4]; int i, j, k, rank, nprocs, err, nerrs=0, bufsize=0; - int ncid, cmode, varid[4], dimid[2], nreqs, reqs[12], sts[4]; - long long *buffer[4], *cbuffer[4], *rbuffer[4]; + int ncid, varid[4], dimid[2], nreqs, reqs[12], sts[4]; + int *buffer[4], *cbuffer[4], *rbuffer[4]; int num_segs[4] = {4, 6, 5, 4}; int req_lens[4], my_nsegs[4]; - int bb_enabled=0; MPI_Datatype buftype[4]; MPI_Offset **starts[4], **counts[4]; @@ -212,26 +205,9 @@ int main(int argc, char** argv) - - - X X X - - - - - - - - - - - X X X */ - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for iput/iget varn in define mode ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } - #ifdef DEBUG if (nprocs != 4 && rank == 0) printf("Warning: %s is intended 
to run on 4 processes\n",argv[0]); @@ -289,7 +265,7 @@ int main(int argc, char** argv) } /* allocate I/O buffer and initialize its contents */ - buffer[i] = (long long*) malloc(sizeof(long long) * req_lens[i]); + buffer[i] = (int*) malloc(sizeof(int) * req_lens[i]); for (j=0; j0) cbuffer[0] = (long long*) malloc(sizeof(long long) * bufsize); + if (bufsize>0) cbuffer[0] = (int*) malloc(sizeof(int) * bufsize); for (i=1; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "iput/iget varn in define mode", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/i_varn_int64.c b/test/nonblocking/i_varn_int64.c index 52700d8b1d..4c20a554b3 100644 --- a/test/nonblocking/i_varn_int64.c +++ b/test/nonblocking/i_varn_int64.c @@ -111,13 +111,13 @@ if ((a)[_i] != NC_NOERR) { \ printf("Error at line %d in %s: err[%d] %s\n", __LINE__, __FILE__, _i, \ ncmpi_strerrno((a)[_i])); \ - nerrs++; \ + assert(0); \ } \ } \ } static -int clear_file_contents(int ncid, int *varid) 
+int clear_file_contents(int ncid, int *varid, int coll_io) { int i, err, rank, nerrs=0; MPI_Offset start[2], count[2]; @@ -127,6 +127,8 @@ int clear_file_contents(int ncid, int *varid) MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Barrier(MPI_COMM_WORLD); + start[0] = start[1] = count[0] = count[1] = 0; if (rank == 0) { /* only rank 0 writes */ count[0] = NY; @@ -138,21 +140,25 @@ int clear_file_contents(int ncid, int *varid) err = ncmpi_iput_vara_longlong(ncid, varid[i], start, count, w_buffer, NULL); CHECK_ERR } - err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); + if (coll_io) + err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); + else + err = ncmpi_wait(ncid, NC_REQ_ALL, NULL, NULL); CHECK_ERR - free(w_buffer); - /* When using burst buffering, flush the log to prevent new value being * skipped due to overlaping domain */ err = ncmpi_flush(ncid); CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + + free(w_buffer); return nerrs; } static -int check_contents_for_fail(int ncid, int *varid) +int check_contents_for_fail(int ncid, int *varid, int coll_io) { /* all processes read entire variables back and check contents */ int i, j, nerrs=0, err, nprocs; @@ -176,20 +182,26 @@ int check_contents_for_fail(int ncid, int *varid) long long *r_buffer = (long long*) malloc(sizeof(long long) * NY*NX); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (nprocs > 4) MPI_Barrier(MPI_COMM_WORLD); + + /* file sync before reading */ + err = ncmpi_sync(ncid); CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); for (i=0; i<4; i++) { for (j=0; j= nprocs) continue; if (r_buffer[j] != expected[i][j]) { - printf("Error at line %d in %s: xxpect read buf[%d][%d]=%lld, but got %lld\n", + printf("Error at line %d in %s: expect read buf[%d][%d]=%lld, but got %lld\n", __LINE__,__FILE__,i,j,expected[i][j],r_buffer[j]); - nerrs++; + assert(0); } } } @@ -207,7 +219,7 @@ check_num_pending_reqs(int ncid, int expected, int lineno) if (n_pendings != expected) { printf("Error at line %d in %s: expect %d pending requests but 
got %d\n", lineno, __FILE__, expected, n_pendings); - nerrs++; + assert(0); } return nerrs; } @@ -224,11 +236,11 @@ void permute(MPI_Offset *a, MPI_Offset *b) } static int -test_varn(int ncid, int rank, int *varid) +test_varn(int ncid, int rank, int *varid, int coll_io) { int i, j, k, err, nerrs=0, bufsize=0; - int nreqs, reqs[4], sts[4]; - long long *buffer[4], *cbuffer[4]; + int nreqs=0, reqs[4], sts[4]; + long long *wbuf[4], *c_wbuf[4], *rbuf[4], *c_rbuf[4]; int num_segs[4] = {4, 6, 5, 4}; int req_lens[4], my_nsegs[4]; MPI_Offset **starts[4], **counts[4]; @@ -263,6 +275,9 @@ test_varn(int ncid, int rank, int *varid) - - - - - - - X X X */ + c_wbuf[0] = NULL; + c_rbuf[0] = NULL; + /* allocate space for starts and counts */ starts[0] = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * 4 * 6); counts[0] = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * 4 * 6); @@ -299,7 +314,7 @@ test_varn(int ncid, int rank, int *varid) if (err != NC_ENULLSTART) { printf("expecting error code NC_ENULLSTART but got %s\n", nc_err_code_name(err)); - nerrs++; + assert(0); } /* only rank 0, 1, 2, and 3 do I/O: @@ -318,71 +333,89 @@ test_varn(int ncid, int rank, int *varid) } /* allocate I/O buffer and initialize its contents */ - buffer[i] = (long long*) malloc(sizeof(long long) * req_lens[i]); - for (j=0; j0) { + c_wbuf[0] = (long long*) malloc(sizeof(long long) * bufsize); + c_rbuf[0] = (long long*) malloc(sizeof(long long) * bufsize); + for (i=1; i0) { - cbuffer[0] = (long long*) malloc(sizeof(long long) * bufsize); - for (i=1; i0) free(cbuffer[0]); - for (i=0; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for iput/iget varn ", basename(argv[0])); - printf("%-66s ------ ", 
cmd_str); - free(cmd_str); - } #ifdef DEBUG + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (nprocs != 4 && rank == 0) printf("Warning: %s is intended to run on 4 processes\n",argv[0]); #endif + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); FATAL_ERR /* create fixed-size variables of size NY * NX */ @@ -598,8 +647,13 @@ int main(int argc, char** argv) err = ncmpi_def_var(ncid, "var3", NC_INT64, NDIMS, dimid, &varid[3]); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* test fixed-size variables */ - nerrs += test_varn(ncid, rank, varid); + nerrs += test_varn(ncid, rank, varid, coll_io); err = ncmpi_redef(ncid); CHECK_ERR @@ -611,30 +665,41 @@ int main(int argc, char** argv) err = ncmpi_def_var(ncid, "t_var3", NC_INT64, NDIMS, dimid, &varid[3]); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* test record variables */ - nerrs += test_varn(ncid, rank, varid); + nerrs += test_varn(ncid, rank, varid, coll_io); err = ncmpi_close(ncid); CHECK_ERR - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) 
printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "iput/iget varn", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/interleaved.c b/test/nonblocking/interleaved.c index 606b74be57..6acfeb75ff 100644 --- a/test/nonblocking/interleaved.c +++ b/test/nonblocking/interleaved.c @@ -83,48 +83,40 @@ nerrs++; \ } \ } -int main(int argc, char** argv) + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info global_info) { - char filename[256]; int i, j, rank, nprocs, err, nerrs=0, expected; - int ncid, cmode, varid[2], dimid[2], req[4], st[4], *buf; - int *buf0, *buf1, *buf2; + int ncid, varid[2], dimid[2], req[4], st[4], *buf=NULL; + int *buf0=NULL, *buf1=NULL, *buf2=NULL; size_t len; MPI_Offset start[2], count[2]; MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* this program is intended to run on one process */ - if (rank) goto fn_exit; - - /* get command-line arguments */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); 
- sprintf(cmd_str, "*** TESTING C %s for writing interleaved fileviews ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } + if (rank) goto err_out; - MPI_Info_create(&info); + MPI_Info_dup(global_info, &info); MPI_Info_set(info, "romio_cb_write", "disable"); MPI_Info_set(info, "ind_wr_buffer_size", "8"); /* these 2 hints are required to cause a core dump if r1758 fix is not * presented */ + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_SELF, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_SELF, out_path, NC_CLOBBER, info, &ncid); + CHECK_FATAL_ERR MPI_Info_free(&info); @@ -142,12 +134,21 @@ int main(int argc, char** argv) /* do not forget to exit define mode */ err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* now we are in data mode */ buf = (int*) malloc(sizeof(int) * NY*NX); /* fill the entire variable var0 with -1s */ for (i=0; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", malloc_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } +err_out: + MPI_Barrier(MPI_COMM_WORLD); -fn_exit: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; 
/* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "writing interleaved fileviews", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/large_num_reqs.c b/test/nonblocking/large_num_reqs.c index 3ea4e3211f..668767e300 100644 --- a/test/nonblocking/large_num_reqs.c +++ b/test/nonblocking/large_num_reqs.c @@ -22,34 +22,25 @@ #define FILE_NAME "testfile.nc" #define NUM_REQS 1100 /* a number greater than NC_REQUEST_CHUNK */ -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) { +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ int i, ncid, dimid[2], varid, err, nerrs=0, rank, nprocs; int *buf, *req, *status; - char filename[256]; MPI_Offset start[2], count[2]; - MPI_Info info=MPI_INFO_NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for large number of iput/iget ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_UNLIMITED, &dimid[0]); CHECK_ERR #define STRESS_ROMIO @@ -66,6 +57,11 @@ int main(int argc, char **argv) { err = 
ncmpi_def_var(ncid, "var", NC_INT, 2, dimid, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + req = (int*) malloc(sizeof(int) * NUM_REQS * 2); status = req + NUM_REQS; @@ -85,13 +81,22 @@ int main(int argc, char **argv) { start[0] += 3; } - err = ncmpi_wait_all(ncid, NUM_REQS, req, status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, NUM_REQS, req, status); + else + err = ncmpi_wait(ncid, NUM_REQS, req, status); + CHECK_ERR + /* check each iput status */ for (i=0; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "large number of iput/iget", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/mcoll_perf.c b/test/nonblocking/mcoll_perf.c index 9d0cf1f4a2..21e1582001 100644 --- a/test/nonblocking/mcoll_perf.c +++ b/test/nonblocking/mcoll_perf.c @@ -300,19 +300,26 @@ int ncmpi_diff(char *filename1, char *filename2) } -int main(int argc, char **argv) +static +int 
test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { + /* file sync before reading */ + extern int optind; + extern char *optarg; int i, j, array_of_gsizes[3]; int nprocs, **buf, rank; MPI_Offset bufcount; int array_of_psizes[3]; int err, nerrs=0; MPI_Offset array_of_starts[3], stride[3]; - char fbasename[256], filename[512]; + char filename[512]; char filename1[512], filename2[512], filename3[512]; char dimname[20], varname[20]; int ncid, dimids0[3], dimids1[3], rank_dim[3], *varid; - MPI_Info info; MPI_Offset **starts, **counts; MPI_Offset *bufcounts; int ndims = 3; @@ -324,33 +331,23 @@ int main(int argc, char **argv) int *sts; int *buf_var; int nvars2; + int keep_files; /* int buf_var[32] ={1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4}; */ - MPI_Init(&argc, &argv); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); verbose = 0; - if (argc > 2) { - if (!rank) printf("Usage: %s [file base name]\n",argv[0]); - MPI_Finalize(); - nerrs++; goto fn_exit; - } - if (argc == 2) snprintf(fbasename, 256, "%s", argv[1]); - else strcpy(fbasename, "testfile"); - MPI_Bcast(fbasename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for mput/iput APIs ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } + keep_files = 0; - length = 2; + length = 10; array_of_gsizes[0] = array_of_gsizes[1] = array_of_gsizes[2] = length; nvars = 4; @@ -459,7 +456,6 @@ int main(int argc, char **argv) printf("varid malloc error\n"); nerrs++; goto fn_exit; } - MPI_Info_create(&info); /* MPI_Info_set(info, "romio_pvfs2_posix_write", "enable"); MPI_Info_set(info, "group_cyclic_fd", "enable"); @@ -469,7 +465,7 @@ int main(int argc, char 
**argv) MPI_Info_set(info, "romio_cb_write", "true"); */ for (k=0; k<=9; k++) { - sprintf(filename, "%s.%d.%d.%d.nc", fbasename, length, nvars, k); + sprintf(filename, "%s.%d.%d.%d.nc", out_path, length, nvars, k); if (k==0) strcpy(filename1, filename); else if (k==7) @@ -674,13 +670,19 @@ printf("filename2=%s filename3=%s\n",filename2, filename3); if (rank == 0 && err == NC_NOERR && verbose) printf("\t OK\n"); */ - } else { - if (rank == 0 && verbose) + } else if (rank == 0 && verbose) printf("\t OK\n"); - } } } + if (rank == 0 && !keep_files) { + for (k=0; k<=9; k++) { + sprintf(filename, "%s.%d.%d.%d.nc", out_path, length, nvars, k); + unlink(filename); + } + } + MPI_Barrier(MPI_COMM_WORLD); + /* int nkeys; MPI_Info_get_nkeys(info, &nkeys); @@ -697,8 +699,6 @@ printf("filename2=%s filename3=%s\n",filename2, filename3); } */ - MPI_Info_free(&info); - for (i=0; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } +fn_exit: + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "mput/iput APIs", opt, test_io); -fn_exit: MPI_Finalize(); - return (nerrs > 
0); + + return err; } diff --git a/test/nonblocking/mcoll_testf.f90 b/test/nonblocking/mcoll_testf.f90 index 0f079e863f..c126781cb0 100644 --- a/test/nonblocking/mcoll_testf.f90 +++ b/test/nonblocking/mcoll_testf.f90 @@ -51,7 +51,8 @@ program Mcoll_Testf ! determined by MPI where a ! zero is specified integer rank, Write_File - character(len=256) :: filename, cmd, msg + character(len=256) :: out_path, in_path, cmd, msg + logical keep_files real*4 filsiz real*4 rdt_l(2) @@ -67,22 +68,29 @@ program Mcoll_Testf ! data TOTSIZ_3D / 256, 256, 256 / data TOTSIZ_3D / 8, 8, 8 / + double precision timing + ! ---------------- ! Begin execution. ! ---------------- - call MPI_Init (ierr) + call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_Size(MPI_COMM_WORLD, totpes, ierr) call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) if (rank .EQ. 0) then - filename = 'testfile.nc' - err = get_args(cmd, filename) + out_path = 'testfile.nc' + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD,ierr) call MPI_Dims_Create (totpes, 3, numpes, ierr) @@ -146,7 +154,7 @@ program Mcoll_Testf locsiz = locsiz_3d(1) * locsiz_3d(2) * locsiz_3d(3) ! =============== - ierr = Write_File(filename, NWRITES, comm_cart, & + ierr = Write_File(out_path, NWRITES, comm_cart, & istart, jstart, kstart, locsiz, locsiz_3d, & TOTSIZ_3D, wrt_l) if (ierr .NE. NF90_NOERR) then @@ -180,9 +188,18 @@ program Mcoll_Testf call MPI_Comm_Free (comm_cart, ierr) + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + if (rank .EQ. 0) then - msg='*** TESTING F90 '//trim(cmd)//' for nf90mpi_iput_var API' - call pass_fail(0, msg) + if (.NOT. 
keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg='*** TESTING F90 '//trim(cmd)//' - nf90mpi_iput_var APIs' + call pass_fail(0, msg, timing) endif 999 call MPI_Finalize(ierr) @@ -192,7 +209,7 @@ end program Mcoll_Testf ! ------------ - integer function Write_File(filename, nwrites, comm_cart, & + integer function Write_File(out_path, nwrites, comm_cart, & istart, jstart, kstart, locsiz, & locsiz_3d, totsiz_3d, wrt_l) @@ -204,7 +221,7 @@ integer function Write_File(filename, nwrites, comm_cart, & ! Argument declarations. ! ---------------------- - character (len=*) filename + character (len=*) out_path integer nwrites integer comm_cart INTEGER(KIND=MPI_OFFSET_KIND) istart, jstart, kstart @@ -261,7 +278,7 @@ integer function Write_File(filename, nwrites, comm_cart, & call MPI_Info_create(info, ierr) ! call MPI_Info_set(info, "romio_pvfs2_posix_write","enable",ierr) - Write_File = nf90mpi_create(comm_cart, filename, NF90_CLOBBER, & + Write_File = nf90mpi_create(comm_cart, out_path, NF90_CLOBBER, & info, ncid) if (Write_File .NE. NF90_NOERR) return diff --git a/test/nonblocking/mcoll_testf77.f b/test/nonblocking/mcoll_testf77.f index b5b7c6abee..3598d3f7df 100644 --- a/test/nonblocking/mcoll_testf77.f +++ b/test/nonblocking/mcoll_testf77.f @@ -69,7 +69,8 @@ program Mcoll_Testf ! zero is specified integer rank, Write_File - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg + logical keep_files real*4 filsiz @@ -87,24 +88,31 @@ program Mcoll_Testf ! data TOTSIZ_3D / 256, 256, 256 / data TOTSIZ_3D / 8, 8, 8 / + double precision timing + ! ---------------- ! Begin execution. ! ---------------- - call MPI_Init (ierr) + call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_Size(MPI_COMM_WORLD, totpes, ierr) call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) if (rank .EQ. 
0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, + ierr) + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD,ierr) + call MPI_Dims_Create (totpes, 3, numpes, ierr) call MPI_Cart_Create(MPI_COMM_WORLD, 3, numpes, isperiodic, @@ -169,7 +177,7 @@ program Mcoll_Testf locsiz = locsiz_3d(1) * locsiz_3d(2) * locsiz_3d(3) ! =============== - ierr = Write_File(filename, NWRITES, comm_cart, + ierr = Write_File(out_path, NWRITES, comm_cart, + istart, jstart, kstart, locsiz, locsiz_3d, + TOTSIZ_3D, wrt_l) if (ierr .NE. NF_NOERR) then @@ -204,9 +212,18 @@ program Mcoll_Testf call MPI_Comm_Free (comm_cart, ierr) + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) + if (rank .EQ. 0) then - msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))//' for iput API' - call pass_fail(0, msg) + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))//' - iput API' + call pass_fail(0, msg, timing) endif 999 call MPI_Finalize (ierr) @@ -217,7 +234,7 @@ program Mcoll_Testf ! ------------ - integer function Write_File(filename, nwrites, comm_cart, + integer function Write_File(out_path, nwrites, comm_cart, + istart, jstart, kstart, locsiz, + locsiz_3d, totsiz_3d, wrt_l) @@ -229,7 +246,7 @@ integer function Write_File(filename, nwrites, comm_cart, ! Argument declarations. ! 
---------------------- - character(LEN=*) filename + character(LEN=*) out_path integer nwrites integer comm_cart integer*8 istart, jstart, kstart @@ -291,7 +308,7 @@ integer function Write_File(filename, nwrites, comm_cart, call MPI_Info_create(info, ierr) ! call MPI_Info_set(info, "romio_pvfs2_posix_write", "enable",ierr) - Write_File = nfmpi_create(comm_cart, filename, NF_CLOBBER, + Write_File = nfmpi_create(comm_cart, out_path, NF_CLOBBER, + info, ncid) if (Write_File .NE. NF_NOERR) return diff --git a/test/nonblocking/parallel_run.sh b/test/nonblocking/parallel_run.sh index 25ffe7650b..da174c4a67 100755 --- a/test/nonblocking/parallel_run.sh +++ b/test/nonblocking/parallel_run.sh @@ -1,24 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,58 +33,24 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + for i in ${check_PROGRAMS} ; do - for j in ${safe_modes} ; do - for intra_aggr in 0 1 
; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - if test "$i" != mcoll_perf ; then - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - fi - # echo "" + exe_name=`basename $i` - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} + for j in ${safe_modes} ; do - if test "$i" = mcoll_perf ; then - continue - fi + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc - # burst buffering does not support nonblocking requests in define mode - if test $i != "i_varn_indef" ; then - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc - fi - fi + done # safe_modes - if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc4 4 - # Validator does not support nc4 - fi - done - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.bb.nc - rm -f ${OUTDIR}/$i.nc.* - rm -f ${OUTDIR}/$i.bb.nc.* -done +done # check_PROGRAMS diff --git a/test/nonblocking/req_all.c b/test/nonblocking/req_all.c index bf793f13f3..7932c3d3f4 100644 --- a/test/nonblocking/req_all.c +++ 
b/test/nonblocking/req_all.c @@ -9,9 +9,9 @@ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * This example shows how to use NC_REQ_ALL in nonblocking I/O operations. * The program writes 2 arrays by calling the nonblocking APIs with NULLs for - * argument request ID. When calling ncmpi_wait_all(), NC_REQ_ALL is used to - * commit all the pending requests without checking the individual statuses of - * the requests. + * argument request ID. When calling ncmpi_wait_all()/ncmpi_wait(), NC_REQ_ALL + * is used to commit all the pending requests without checking the individual + * statuses of the requests. * * To compile: * mpicc -O2 req_all.c -o req_all -lpnetcdf @@ -65,38 +65,28 @@ #define NY 8 #define NX 2 -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; int i, j, rank, nprocs, nerrs=0, err; - int ncid, cmode, varid[2], dimid[2], buf_int[NY][NX]; + int ncid, varid[2], dimid[2], buf_int[NY][NX]; float buf_flt[NY][NX]; MPI_Offset global_ny, global_nx; MPI_Offset start[2], count[2]; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for NC_REQ_ALL ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); 
CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* the global array is NY * (NX * nprocs) */ global_ny = NY; @@ -121,6 +111,11 @@ int main(int argc, char** argv) /* do not forget to exit define mode */ err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* now we are in data mode */ start[0] = 0; start[1] = NX * rank; @@ -130,7 +125,11 @@ int main(int argc, char** argv) err = ncmpi_iput_vara_int(ncid, varid[0], start, count, &buf_int[0][0], NULL); CHECK_ERR err = ncmpi_iput_vara_float(ncid, varid[1], start, count, &buf_flt[0][0], NULL); CHECK_ERR - err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); + else + err = ncmpi_wait(ncid, NC_REQ_ALL, NULL, NULL); + CHECK_ERR /* check if write buffer contents have been altered */ for (i=0; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "NC_REQ_ALL", opt, test_io); 
MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/seq_runs.sh b/test/nonblocking/seq_runs.sh index aa47560d30..205d1fb396 100755 --- a/test/nonblocking/seq_runs.sh +++ b/test/nonblocking/seq_runs.sh @@ -1,47 +1,44 @@ -#!/bin/sh +#!/bin/bash # -# Copyright (C) 2003, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # -# "set -x" expands variables and prints a little + sign before the line - # Exit immediately if a command exits with a non-zero status. set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +exe_name=`basename $1` # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -${TESTSEQRUN} ./mcoll_perf ${TESTOUTDIR}/testfile -# seq is not available on FreeBSD otherwise we can use: for j in `seq 0 9` -for j in 0 1 2 3 4 5 6 7 8 9 ; do - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/testfile.2.4.$j.nc -done - -# echo "" - -if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${TESTSEQRUN} ./mcoll_perf ${TESTOUTDIR}/testfile_bb - unset PNETCDF_HINTS - for j in 0 1 2 3 4 5 6 7 8 9 ; do - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/testfile_bb.2.4.$j.nc - - # echo "--- ncmpidiff testfile.2.4.$j.nc testfile_bb.2.4.$j.nc ---" - 
${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/testfile.2.4.$j.nc ${TESTOUTDIR}/testfile_bb.2.4.$j.nc - done -fi +for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc -for j in 0 1 2 3 4 5 6 7 8 9 ; do - rm -f ${OUTDIR}/testfile.2.4.$j.nc - rm -f ${OUTDIR}/testfile_bb.2.4.$j.nc -done +done # safe_modes diff --git a/test/nonblocking/test_bput.c b/test/nonblocking/test_bput.c index 35ec82667e..2fb461b646 100644 --- a/test/nonblocking/test_bput.c +++ b/test/nonblocking/test_bput.c @@ -15,67 +15,54 @@ #include -#define FILE_NAME "testfile.nc" - -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) { - int i, j, ncid, dimid[2], varid, err, nerrs=0, rank, nprocs, bb_enabled; - int req[2], status[2]; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + char hint[MPI_MAX_INFO_VAL]; + int i, j, ncid, dimid[2], varid, err, nerrs=0, rank, bb_enabled; + int flag, req[2], status[2]; float var[4][6]; - char filename[256]; MPI_Offset bufsize, start[2], count[2], stride[2], imap[2]; - MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); #ifdef DEBUG + int nprocs; + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (nprocs > 1 && rank == 0) printf("Warning: %s is designed to run on 1 process\n", argv[0]); #endif - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for bput API ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - 
free(cmd_str); - } + if (rank) goto err_out; - MPI_Info_create(&info); - /* MPI_Info_set(info, "romio_pvfs2_posix_write","enable"); */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, hint, &flag); + if (flag && strcasecmp(hint, "enable") == 0) + bb_enabled = 1; + else + bb_enabled = 0; - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER | NC_64BIT_DATA, info, &ncid); CHECK_ERR - MPI_Info_free(&info); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - { - int flag; - char hint[MPI_MAX_INFO_VAL]; - MPI_Info infoused; - - ncmpi_inq_file_info(ncid, &infoused); - MPI_Info_get(infoused, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, hint, &flag); - if (flag && strcasecmp(hint, "enable") == 0) - bb_enabled = 1; - else - bb_enabled = 0; - MPI_Info_free(&infoused); - } + err = ncmpi_create(MPI_COMM_SELF, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define a variable of a 6 x 4 integer array in the nc file */ err = ncmpi_def_dim(ncid, "Y", 6, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", 4, &dimid[1]); CHECK_ERR - err = ncmpi_def_var(ncid, "var", NC_INT64, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var(ncid, "var", NC_INT, 2, dimid, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* set the contents of the write buffer var, a 4 x 6 float array 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, @@ -85,7 +72,7 @@ int main(int argc, char **argv) { for (j=0; j<4; j++) for (i=0; i<6; i++) var[j][i] = 50.5 + j*6+i; /* bufsize must be max of data type converted before and after */ - bufsize = 4*6*sizeof(long long); + bufsize = 4*6*sizeof(int); err = ncmpi_buffer_attach(ncid, bufsize); CHECK_ERR /* write var to the NC variable in the matrix transposed way */ @@ -106,7 +93,7 @@ int main(int argc, char **argv) { if (var[j][i] != 50.5 + j*6+i) { printf("Error at line %d in %s: put buffer[%d][%d]=%f altered, should be %f\n", 
__LINE__,__FILE__,j,i,var[j][i],50.5+j*6+i); - nerrs++; + assert(0); } } @@ -120,11 +107,15 @@ int main(int argc, char **argv) { if (var[j][i] != 50.5 + j*6+i) { printf("Error at line %d in %s: put buffer[%d][%d]=%f altered, should be %f\n", __LINE__,__FILE__,j,i,var[j][i],50.5+j*6+i); - nerrs++; + assert(0); } } - err = ncmpi_wait_all(ncid, 2, req, status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, 2, req, status); + else + err = ncmpi_wait(ncid, 2, req, status); + CHECK_ERR /* check each bput status */ for (i=0; i<2; i++) { @@ -151,8 +142,7 @@ int main(int argc, char **argv) { /* this error is a pnetcdf internal error, if occurs */ printf("Error at line %d in %s: put buffer[%d][%d]=%f altered, should be %f\n", __LINE__,__FILE__,j,i,var[j][i],50.5+j*6+i); - nerrs++; - break; + assert(0); } } } @@ -167,25 +157,34 @@ int main(int argc, char **argv) { err = ncmpi_close(ncid); CHECK_ERR - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } +err_out: + MPI_Barrier(MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint 
nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "bput API", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/test_bputf.f90 b/test/nonblocking/test_bputf.f90 index 56b9eaefa0..6dfccfae4b 100644 --- a/test/nonblocking/test_bputf.f90 +++ b/test/nonblocking/test_bputf.f90 @@ -34,21 +34,28 @@ program main integer(kind=MPI_OFFSET_KIND) bufsize, inq_bufsize integer(kind=MPI_OFFSET_KIND) usage, acc_usage real var(6,4) - character(len=256) :: filename, cmd, msg + character(len=256) :: out_path, in_path, cmd, msg character(len=512) :: hints + logical keep_files + double precision timing + + call MPI_Init(ierr) + + timing = MPI_Wtime() - call MPI_INIT(ierr) call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr) call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr) if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD,ierr) verbose = .FALSE. if (nprocs > 1 .AND. rank .EQ. 0 .AND. verbose) then @@ -66,7 +73,7 @@ program main ! 
call MPI_Info_set(info, "romio_pvfs2_posix_write","enable",ierr) cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - err = nf90mpi_create(MPI_COMM_WORLD, filename, cmode, & + err = nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, & info, ncid) call check(err, 'Error at nf90mpi_create ') @@ -233,9 +240,18 @@ program main err = nf90mpi_close(ncid) call check(err, 'Error at nf90mpi_close ') + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) + if (rank .EQ. 0) then - msg = '*** TESTING F90 '//trim(cmd)//' for bput_var' - call pass_fail(no_err, msg) + if (.NOT. keep_files) then + err = nf90mpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - bput_var' + call pass_fail(no_err, msg, timing) endif 999 CALL MPI_Finalize(ierr) diff --git a/test/nonblocking/test_bputf77.f b/test/nonblocking/test_bputf77.f index 564b56c696..d45f9cfc14 100644 --- a/test/nonblocking/test_bputf77.f +++ b/test/nonblocking/test_bputf77.f @@ -44,24 +44,31 @@ program main integer*8 imap(2) integer*8 dim_size, bufsize, inq_bufsize real var(6,4) - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer*8 usage, acc_usage character(LEN=512) hints + logical keep_files + double precision timing + + call MPI_Init(ierr) + + timing = MPI_Wtime() - call MPI_INIT(ierr) call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr) call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr) if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, + ierr) + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD,ierr) + verbose = .FALSE. if (nprocs .GT. 
1 .AND. rank .EQ. 0 .AND. verbose) then print*,'Warning: ',cmd(1:XTRIM(cmd)), @@ -78,7 +85,7 @@ program main ! call MPI_Info_set(info, "romio_pvfs2_posix_write","enable",ierr) cmode = IOR(NF_CLOBBER, NF_64BIT_DATA) - err = nfmpi_create(MPI_COMM_WORLD, filename, cmode, + err = nfmpi_create(MPI_COMM_WORLD, out_path, cmode, + info, ncid) call check(err, 'Error at nfmpi_create ') @@ -251,10 +258,19 @@ program main err = nfmpi_close(ncid) call check(err, 'Error at nfmpi_close ') + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) + if (rank .EQ. 0) then + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))// - + ' for bput_varm_real API' - call pass_fail(no_err, msg) + + ' - bput_varm_real API' + call pass_fail(no_err, msg, timing) endif 999 CALL MPI_Finalize(ierr) diff --git a/test/nonblocking/wait_after_indep.c b/test/nonblocking/wait_after_indep.c index 5351a3f7ea..c135fc0cd2 100644 --- a/test/nonblocking/wait_after_indep.c +++ b/test/nonblocking/wait_after_indep.c @@ -26,41 +26,31 @@ #define NX 10 #define NDIMS 2 -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; int i, j, rank, nprocs, err, nerrs=0; int ncid, varid, dimid[2], req, st; MPI_Offset start[2], count[2], stride[2]; - unsigned char buffer[NY][NX]; + signed char buffer[NY][NX]; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) 
+ 256); - sprintf(cmd_str, "*** TESTING C %s for ncmpi_end_indep_data ", basename(argv[0])); - printf("%-66s ------ ",cmd_str); - free(cmd_str); - } - - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER|NC_64BIT_DATA, - MPI_INFO_NULL, &ncid); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NC_UNLIMITED, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", NX*nprocs, &dimid[1]); CHECK_ERR - err = ncmpi_def_var(ncid, "var", NC_UBYTE, NDIMS, dimid, &varid); CHECK_ERR + err = ncmpi_def_var(ncid, "var", NC_BYTE, NDIMS, dimid, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR for (i=0; i 0) { - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} + +int main(int argc, char **argv) { - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "ncmpi_end_indep_data()", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/nonblocking/wrap_runs.sh b/test/nonblocking/wrap_runs.sh index 3fdd5e99c3..9852ea4675 100755 
--- a/test/nonblocking/wrap_runs.sh +++ b/test/nonblocking/wrap_runs.sh @@ -16,7 +16,7 @@ outfile=`basename $1` OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc # echo "" @@ -36,7 +58,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -48,6 +70,7 @@ for j in ${safe_modes} ; do fi fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/parallel_run.sh b/test/parallel_run.sh new file mode 100755 index 0000000000..c1d4fcde9a --- /dev/null +++ b/test/parallel_run.sh @@ -0,0 +1,329 @@ +#!/bin/bash +# +# Copyright (C) 2018, Northwestern University and Argonne National Laboratory 
+# See COPYRIGHT notice in top-level directory. +# + +# Exit immediately if a command exits with a non-zero status. +set -e + +DRY_RUN=no +VERBOSE=no + +exe_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + + saved_PNETCDF_HINTS= + cmd=`basename $1` + if test "x$MIMIC_LUSTRE" = x1 && test "x$cmd" = xncmpidiff ; then + # echo "export MIMIC_STRIPE_SIZE=1048576" + export MIMIC_STRIPE_SIZE=1048576 + saved_PNETCDF_HINTS=$PNETCDF_HINTS + unset PNETCDF_HINTS + fi + + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi + + if test "x$MIMIC_LUSTRE" = x1 && test "x$cmd" = xncmpidiff ; then + # echo "unset MIMIC_STRIPE_SIZE" + unset MIMIC_STRIPE_SIZE + export PNETCDF_HINTS=$saved_PNETCDF_HINTS + fi +} + +seq_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +VALIDATOR=../../src/utils/ncvalidator/ncvalidator +NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff + +MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` +# echo "MPIRUN = ${MPIRUN}" +# echo "check_PROGRAMS=${check_PROGRAMS}" + +# remove file system type prefix if there is any +OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` + +# let NTHREADS=$1*6-1 +NTHREADS=`expr $1 \* 6 - 1` + +# echo "${LINENO}: PNETCDF_DEBUG = ${PNETCDF_DEBUG}" +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +# prevent user environment setting of PNETCDF_HINTS to interfere +unset PNETCDF_HINTS + +fixed_length=23 + +for i in ${check_PROGRAMS} ; do + if test "$i" = pres_temp_4D_rd ; then + # running pres_temp_4D_rd is a part of pres_temp_4D_wr + continue + fi + if test "$i" = tst_io ; then + # this is designed to run 1 process + continue + fi + if test "$i" = tst_version ; then + # this program read only and 
creates no output file + exe_cmd ./$i + continue + fi + if test "$i" = tst_open_cdf5 ; then + # this program read only and creates no output file + exe_cmd ./$i ${srcdir}/bad_begin.nc5 + continue + fi + if test "$i" = tst_corrupt ; then + # this program read only and creates no output file + exe_cmd ./$i ${srcdir} + continue + fi + + for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + safe_hint=" SAFE" + else + safe_hint="NOSAFE" + fi + OUT_PREFIX="${TESTOUTDIR}/$i" + + for no_indep_rw in true false ; do + no_indep_rw_hint="romio_no_indep_rw=$no_indep_rw" + + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + DRIVER_OUT_FILE="${OUT_PREFIX}.mpio" + driver_hint=" MPIO" + else + USEMPIO_HINTS="nc_pncio=enable" + DRIVER_OUT_FILE="${OUT_PREFIX}.pncio" + driver_hint="PNCIO" + fi + for intra_aggr in 0 1 ; do + if test "$intra_aggr" = 1 ; then + INA_HINTS="nc_num_aggrs_per_node=2" + INA_OUT_FILE="${DRIVER_OUT_FILE}.ina" + ina_hint=" INA" + else + INA_HINTS="nc_num_aggrs_per_node=0" + INA_OUT_FILE="${DRIVER_OUT_FILE}" + ina_hint="NOINA" + fi + + OUT_FILE=$INA_OUT_FILE + TEST_OPTS="$safe_hint $driver_hint $ina_hint" + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + # More rigorous tests using a small moving chunk size + PNETCDF_HINTS="nc_data_move_chunk_size=100" + + PNETCDF_HINTS="$no_indep_rw_hint;$PNETCDF_HINTS" + + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS;$PNETCDF_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test 
"x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + fi + + if test "$i" = tst_pthread ; then + # each MPI process created 6 threads + exe_cmd ./$i ${OUT_FILE}.nc + for k in `seq 0 ${NTHREADS}` ; do + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.nc.$k + rm -f ${OUTDIR}/tst_pthread.nc.$k + done + continue + elif test "$i" = pres_temp_4D_wr ; then + exe_cmd ./$i ${OUT_FILE}.nc + exe_cmd ./pres_temp_4D_rd ${OUT_FILE}.nc + elif test "$i" = pres_temp_4D_rd ; then + continue + elif test "$i" = test_inq_format ; then + exe_cmd ./$i ${srcdir} + continue + elif test "$i" = "tst_cdl_hdr_parser" ; then + exe_cmd ./$i -q -o ${OUT_FILE}.nc ${srcdir}/cdl_header.txt + continue + elif test "$i" = mcoll_perf ; then + exe_cmd ./$i ${OUT_FILE} + else + exe_cmd ./$i ${OUT_FILE}.nc + fi + + # put_all_kinds and iput_all_kinds output 3 files + if test "$i" = put_all_kinds -o "$i" = iput_all_kinds ; then + for k in 1 2 5 ; do + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.nc$k + done + elif test "$i" = mcoll_perf ; then + for j in `seq 0 9` ; do + ext="2.4.$j.nc" + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.$ext + done + else + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.nc + fi + + if test "x${ENABLE_BURST_BUFFER}" = x1 ; then + # echo "${LINENO}: ---- test burst buffering feature" + saved_PNETCDF_HINTS=${PNETCDF_HINTS} + export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + if test "$i" = mcoll_perf ; then + exe_cmd ./$i ${OUT_FILE}.bb + else + exe_cmd ./$i ${OUT_FILE}.bb.nc + fi + export PNETCDF_HINTS=${saved_PNETCDF_HINTS} + + # put_all_kinds and iput_all_kinds output 3 files + if test "$i" = put_all_kinds -o "$i" = iput_all_kinds ; then + for k in 1 2 5 ; do + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.bb.nc$k + exe_cmd ${NCMPIDIFF} -q ${OUT_FILE}.nc$k ${OUT_FILE}.bb.nc$k + done + continue + elif test "$i" = mcoll_perf ; then + for j in `seq 0 9` ; do + ext="2.4.$j.nc" + 
bb_ext="bb.2.4.$j.nc" + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.$bb_ext + exe_cmd ${NCMPIDIFF} -q ${OUT_FILE}.$ext ${OUT_FILE}.$bb_ext + done + continue + else + seq_cmd ${VALIDATOR} -q ${OUT_FILE}.bb.nc + fi + + # compare file header only for large file tests + DIFF_OPT="-q" + if test "$i" = last_large_var || + test "$i" = dim_cdf12 || + test "$i" = tst_cdl_hdr_parser || + test "$i" = bigrecords || + test "$i" = high_dim_var || + test "$i" = large_attr || + test "$i" = large_coalesce || + test "$i" = large_dims_vars_attrs || + test "$i" = large_files || + test "$i" = large_header || + test "$i" = large_reqs || + test "$i" = large_var || + test "$i" = tst_cdf5_begin || + test "$i" = tst_flarge || + test "$i" = tst_hash_large_ndims || + test "$i" = tst_hash_large_ngattrs || + test "$i" = tst_hash_large_nvars ; then + DIFF_OPT+=" -h" + fi + exe_cmd ${NCMPIDIFF} $DIFF_OPT $OUT_FILE.nc $OUT_FILE.bb.nc + fi + + if test "x${ENABLE_NETCDF4}" = x1 ; then + if test "$i" = tst_grow_data ; then + continue + fi + exe_cmd ./$i ${OUT_FILE}.nc4 4 + # Validator does not support nc4 + fi + done # intra_aggr + done # mpiio_mode + + if [[ "$i" == *"vard"* ]] ; then + continue + fi + + DIFF_OPT="-q" + if test "$i" = last_large_var || + test "$i" = dim_cdf12 || + test "$i" = tst_cdl_hdr_parser || + test "$i" = bigrecords || + test "$i" = high_dim_var || + test "$i" = large_attr || + test "$i" = large_coalesce || + test "$i" = large_dims_vars_attrs || + test "$i" = large_files || + test "$i" = large_header || + test "$i" = large_reqs || + test "$i" = large_var || + test "$i" = tst_cdf5_begin || + test "$i" = tst_flarge || + test "$i" = tst_hash_large_ndims || + test "$i" = tst_hash_large_ngattrs || + test "$i" = tst_hash_large_nvars ; then + DIFF_OPT+=" -h" + fi + if test "$i" = test_inq_format ; then + continue + fi + if test "$i" = put_all_kinds || test "$i" = iput_all_kinds ; then + for j in 1 2 5; do + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.mpio.ina.nc$j + 
exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.nc$j + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.pncio.nc$j $OUT_PREFIX.pncio.ina.nc$j + done + elif test "$i" = tst_pthread ; then + for j in `seq 0 ${NTHREADS}` ; do + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.mpio.ina.nc.$j + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.nc.$j + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.pncio.nc.$j $OUT_PREFIX.pncio.ina.nc.$j + done + elif test "$i" = mcoll_perf ; then + for j in `seq 0 9` ; do + ext="2.4.$j.nc" + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.$ext $OUT_PREFIX.mpio.ina.$ext + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.$ext $OUT_PREFIX.pncio.$ext + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.pncio.$ext $OUT_PREFIX.pncio.ina.$ext + done + else + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc + exe_cmd $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.pncio.nc $OUT_PREFIX.pncio.ina.nc + fi + + done # no_indep_rw + done # safe_modes + + if test "x$i" = xpres_temp_4D_wr ; then + rm -f ${OUTDIR}/pres_temp_4D*.nc* + else + rm -f ${OUTDIR}/$i*nc* + fi +done # check_PROGRAMS + diff --git a/test/subfile/Makefile.am b/test/subfile/Makefile.am index ed268755cd..ab69f3c461 100644 --- a/test/subfile/Makefile.am +++ b/test/subfile/Makefile.am @@ -25,30 +25,35 @@ if DECL_MPI_OFFSET # AM_FCFLAGS += $(FC_DEFINE)HAVE_DECL_MPI_OFFSET endif -TESTPROGRAMS = test_subfile - -check_PROGRAMS = $(TESTPROGRAMS) +check_PROGRAMS = test_subfile TESTS_ENVIRONMENT = export SED="$(SED)"; TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT 
+= export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS = seq_runs.sh +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif +if ENABLE_THREAD_SAFE + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE=1; +endif + +TESTS = $(check_PROGRAMS) TEST_EXTENSIONS = .sh +LOG_COMPILER = $(srcdir)/seq_runs.sh +SH_LOG_COMPILER = -CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \ - $(TESTOUTDIR)/test_subfile.nc \ - $(TESTOUTDIR)/test_subfile.nc.subfile_0.nc \ - $(TESTOUTDIR)/test_subfile.nc.subfile_1.nc \ - $(TESTOUTDIR)/test_subfile.bb.nc \ - $(TESTOUTDIR)/test_subfile.bb.nc.subfile_0.nc \ - $(TESTOUTDIR)/test_subfile.bb.nc.subfile_1.nc +CLEANFILES = $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ + core core.* *.gcda *.gcno *.gcov gmon.out EXTRA_DIST = README.md seq_runs.sh parallel_run.sh diff --git a/test/subfile/parallel_run.sh b/test/subfile/parallel_run.sh index dea9874ade..da174c4a67 100755 --- a/test/subfile/parallel_run.sh +++ b/test/subfile/parallel_run.sh @@ -1,24 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN $@ + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,60 +33,24 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS +if test "x$MIMIC_LUSTRE" != x1 ; then + PNETCDF_HINTS="cb_nodes=2" +fi + for i in ${check_PROGRAMS} ; do - for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${MPIRUN} ./$i -f ${TESTOUTDIR}/$i.nc -s 2 - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - # echo "--- validating file ${TESTOUTDIR}/$i.nc.subfile_0.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc.subfile_0.nc - # echo "--- validating file ${TESTOUTDIR}/$i.nc.subfile_1.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc.subfile_1.nc - # echo "" + exe_name=`basename $i` + + for j in ${safe_modes} ; do - # skip burst buffering test, as it has not supported subfiling yet - continue + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line 
${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i -f ${TESTOUTDIR}/$i.bb.nc -s 2 - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc.subfile_0.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc.subfile_0.nc - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc.subfile_1.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc.subfile_1.nc + done # safe_modes - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc - # echo "--- ncmpidiff $i.nc.subfile_0.nc $i.bb.nc.subfile_0.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc.subfile_0.nc ${TESTOUTDIR}/$i.bb.nc.subfile_0.nc - # echo "--- ncmpidiff $i.nc.subfile_1.nc $i.bb.nc.subfile_1.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc.subfile_1.nc ${TESTOUTDIR}/$i.bb.nc.subfile_1.nc - fi - done - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.nc.subfile_0.nc - rm -f ${OUTDIR}/$i.nc.subfile_1.nc - rm -f ${OUTDIR}/$i.bb.nc - rm -f ${OUTDIR}/$i.bb.nc.subfile_0.nc - rm -f ${OUTDIR}/$i.bb.nc.subfile_1.nc -done +done # check_PROGRAMS diff --git a/test/subfile/seq_runs.sh b/test/subfile/seq_runs.sh index 46a34d3842..205d1fb396 100755 --- a/test/subfile/seq_runs.sh +++ b/test/subfile/seq_runs.sh @@ -1,63 +1,44 @@ -#!/bin/sh +#!/bin/bash # -# Copyright (C) 2003, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. 
# # Exit immediately if a command exits with a non-zero status. set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" fi +exe_name=`basename $1` + # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -for i in ${TESTPROGRAMS} ; do - for j in ${safe_modes} ; do - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - ${TESTSEQRUN} ./$i -f ${TESTOUTDIR}/$i.nc -s 2 - - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - # echo "--- validating file ${TESTOUTDIR}/$i.nc.subfile_0.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc.subfile_0.nc - # echo "" - - # skip burst buffering test, as it has not supported subfiling yet - continue - - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" - - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${TESTSEQRUN} ./$i -f ${TESTOUTDIR}/$i.bb.nc -s 2 - unset PNETCDF_HINTS - - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc.subfile_0.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc.subfile_0.nc - - # echo "--- ncmpidiff $i.nc.subfile_0.nc $i.bb.nc.subfile_0.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc.subfile_0.nc 
${TESTOUTDIR}/$i.bb.nc.subfile_0.nc - fi - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.nc.subfile_0.nc - rm -f ${OUTDIR}/$i.bb.nc - rm -f ${OUTDIR}/$i.bb.nc.subfile_0.nc -done +for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc + +done # safe_modes diff --git a/test/subfile/test_subfile.c b/test/subfile/test_subfile.c index 356be8df94..85e19aa87b 100644 --- a/test/subfile/test_subfile.c +++ b/test/subfile/test_subfile.c @@ -25,93 +25,36 @@ Array size 128^3. For other array sizes, change array_of_gsizes below. */ -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - int opt, verbose=0; extern char *optarg; extern int optind; - int i, j, array_of_gsizes[3]; - int nprocs, len, **buf, rank; - MPI_Offset bufcount; - int array_of_psizes[3]; - int err; - MPI_Offset array_of_starts[3]; - char *fbasename=NULL; char dimname[20], varname[20]; - int ncid, dimids0[3], rank_dim[3], *varid=NULL; - MPI_Info info=MPI_INFO_NULL, info_used=MPI_INFO_NULL; - MPI_Offset **starts_list, **count_list; + int i, j, err, nerrs=0, verbose=0, nprocs, rank, ncid, *varid=NULL; + int ndims=3, ngatts, unlimdimid, dimids0[3], rank_dim[3]; + int num_files, **buf, array_of_psizes[3], array_of_gsizes[3]; + MPI_Offset bufcount, array_of_starts[3], **starts_list, **count_list; MPI_Offset *bufcount_list; - int ndims=3, nvars=1, ngatts, unlimdimid; MPI_Datatype *datatype_list; - int length = 8; - double stim, write_tim, new_write_tim, write_bw; - double read_tim, new_read_tim, read_bw; - double open_tim, new_open_tim; + MPI_Info info_used=MPI_INFO_NULL; + double stim, write_tim, new_write_tim, write_bw, read_bw; + double open_tim, new_open_tim, read_tim, new_read_tim; double close_tim, 
new_close_tim; + int num_sf = 2; int par_dim_id = 0; /* default is 0 */ int do_read = 0; - int nerrs=0; + int nvars = 1; + int length = 8; - MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* process 0 takes the file name as a command-line argument and - broadcasts it to other processes */ - if (rank == 0) { - while ((opt = getopt(argc, argv, "f:s:p:n:l:r")) != EOF) { - switch (opt) { - case 'f': fbasename = optarg; - break; - case 's': num_sf = (int)strtol(optarg,NULL,10); - break; - case 'r': do_read = 1; - break; - case 'p': par_dim_id = (int)strtol(optarg,NULL,10); - break; - case 'n': nvars = (int)strtol(optarg,NULL,10); - break; - case 'l': length = (int)strtol(optarg,NULL,10); - break; - default: - break; - } - } - if (fbasename == NULL) { - fprintf(stderr, "\n*# Usage: test_subfile -f pathname -s num_sf -p par_dim_id \n\n"); - nerrs++; - } - } - MPI_Bcast(&nerrs, 1, MPI_INT, 0, MPI_COMM_WORLD); - if (nerrs > 0) { - MPI_Finalize(); - return 1; - } - - if (rank == 0) { - len = (fbasename == NULL) ? 
0 : (int)strlen(fbasename); - MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD); - } - else { - MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD); - fbasename = (char *) malloc(len+1); - } - MPI_Bcast(fbasename, len+1, MPI_CHAR, 0, MPI_COMM_WORLD); - MPI_Bcast(&num_sf, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&par_dim_id, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&nvars, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&do_read, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&length, 1, MPI_INT, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for subfiling", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } - array_of_gsizes[0] = array_of_gsizes[1] = array_of_gsizes[2] = length; buf = (int **)malloc(sizeof(int*) * nvars); @@ -196,7 +139,6 @@ int main(int argc, char **argv) buf[i][j]=rank+1; } - MPI_Info_create(&info); /* set all non-record variable to be subfiled */ char tmp[10]; sprintf(tmp, "%d", num_sf); @@ -206,8 +148,12 @@ int main(int argc, char **argv) if (do_read == 1) goto read; stim = MPI_Wtime(); - err = ncmpi_create(MPI_COMM_WORLD, fbasename, NC_CLOBBER|NC_64BIT_DATA, - info, &ncid); + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR open_tim = MPI_Wtime() - stim; @@ -333,7 +279,7 @@ int main(int argc, char **argv) goto end; read: - err = ncmpi_open(MPI_COMM_WORLD, fbasename, NC_NOWRITE, info, &ncid); + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR stim = MPI_Wtime(); @@ -369,7 +315,6 @@ int main(int argc, char **argv) CHECK_ERR end: - if (info != MPI_INFO_NULL) MPI_Info_free(&info); if (info_used != MPI_INFO_NULL) MPI_Info_free(&info_used); for (i=0; i 0) free(fbasename); - MPI_Offset malloc_size, sum_size; int nfiles, ncids[10]; /* NULL argument test */ @@ -397,22 +340,47 @@ int main(int argc, char 
**argv) CHECK_ERR if (nfiles > 0) printf("nfiles %d still opened\n",nfiles); - /* check for any PnetCDF internal malloc residues */ - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - } + /* open the subfiles to validate the file format */ + MPI_Barrier(MPI_COMM_WORLD); + if (rank > 0) goto fn_exit; - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); + num_files = (nprocs < num_sf) ? 1 : num_sf; + for (i=0; i 0); + + return err; } diff --git a/test/test_installed/README.md b/test/test_installed/README.md index 62987cfd76..37b57dd17e 100644 --- a/test/test_installed/README.md +++ b/test/test_installed/README.md @@ -109,12 +109,6 @@ distribution. All test programs are designed to run on 4 MPI processes. 
cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib test_fillvalue.o testutils.o -lpnetcdf -o test_fillvalue cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../testcases/test_get_varn.c cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib test_get_varn.o testutils.o -lpnetcdf -o test_get_varn - cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../testcases/test_vard.c - cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib test_vard.o testutils.o -lpnetcdf -o test_vard - cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../testcases/test_vard_multiple.c - cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib test_vard_multiple.o testutils.o -lpnetcdf -o test_vard_multiple - cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../testcases/test_vard_rec.c - cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib test_vard_rec.o testutils.o -lpnetcdf -o test_vard_rec cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../testcases/test_varm.c cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib test_varm.o testutils.o -lpnetcdf -o test_varm cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../testcases/tst_def_var_fill.c @@ -182,10 +176,6 @@ distribution. All test programs are designed to run on 4 MPI processes. 
cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib transpose2D.o testutils.o -lpnetcdf -o transpose2D cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../../examples/C/transpose.c cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib transpose.o testutils.o -lpnetcdf -o transpose - cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../../examples/C/vard_int.c - cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib vard_int.o testutils.o -lpnetcdf -o vard_int - cc -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -c ../../examples/C/vard_mvars.c - cc -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib vard_mvars.o testutils.o -lpnetcdf -o vard_mvars ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -c ../../examples/F77/block_cyclic.f -o block_cyclic.77o ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -o block_cyclic.exe77 block_cyclic.77o utils.o -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib -lpnetcdf ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -c ../../examples/F77/bput_varn_int8.f -o bput_varn_int8.77o @@ -214,8 +204,6 @@ distribution. All test programs are designed to run on 4 MPI processes. 
ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -o time_var.exe77 time_var.77o utils.o -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib -lpnetcdf ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -c ../../examples/F77/transpose.f -o transpose.77o ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -o transpose.exe77 transpose.77o utils.o -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib -lpnetcdf - ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -c ../../examples/F77/vard_int.f -o vard_int.77o - ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -o vard_int.exe77 vard_int.77o utils.o -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib -lpnetcdf ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -c ../../examples/F90/block_cyclic.f90 -o block_cyclic.90o ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -o block_cyclic.exe90 block_cyclic.90o utils.o -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib -lpnetcdf ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -c ../../examples/F90/column_wise.f90 -o column_wise.90o @@ -238,8 +226,6 @@ distribution. All test programs are designed to run on 4 MPI processes. 
ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -o put_varn_real.exe90 put_varn_real.90o utils.o -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib -lpnetcdf ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -c ../../examples/F90/transpose.f90 -o transpose.90o ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -o transpose.exe90 transpose.90o utils.o -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib -lpnetcdf - ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -c ../../examples/F90/vard_int.f90 -o vard_int.90o - ftn -I/opt/cray/pe/parallel-netcdf/1.12.3.1/include -I../common -w -fallow-argument-mismatch -o vard_int.exe90 vard_int.90o utils.o -L/opt/cray/pe/parallel-netcdf/1.12.3.1/lib -lpnetcdf ``` @@ -298,9 +284,6 @@ distribution. All test programs are designed to run on 4 MPI processes. *** TESTING C test_erange for checking for NC_ERANGE ------ pass *** TESTING C test_fillvalue for _FillValue for NC_GLOBAL ------ pass *** TESTING C test_get_varn for get_varn ------ pass - *** TESTING C test_vard for vard put and get ------ pass - *** TESTING C test_vard_multiple for vard to 2 variables ------ pass - *** TESTING C test_vard_rec for vard put on record var ------ pass *** TESTING C test_varm for get/put varm ------ pass *** TESTING C tst_def_var_fill for def_var_fill ------ pass *** TESTING C tst_dimsizes for defining max dimension sizes ------ pass @@ -336,8 +319,6 @@ distribution. All test programs are designed to run on 4 MPI processes. 
*** TESTING C examples/C/time_var ------ pass *** TESTING C examples/C/transpose2D ------ pass *** TESTING C examples/C/transpose ------ pass - *** TESTING C examples/C/vard_int ------ pass - *** TESTING C examples/C/vard_mvars ------ pass *** TESTING F77 examples/F77/block_cyclic.exe77 ------ pass *** TESTING F77 examples/F77/bput_varn_int8.exe77 ------ pass *** TESTING F77 examples/F77/column_wise.exe77 ------ pass @@ -352,7 +333,6 @@ distribution. All test programs are designed to run on 4 MPI processes. *** TESTING F77 examples/F77/put_varn_real.exe77 ------ pass *** TESTING F77 examples/F77/time_var.exe77 ------ pass *** TESTING F77 examples/F77/transpose.exe77 ------ pass - *** TESTING F77 examples/F77/vard_int.exe77 ------ pass *** TESTING F90 examples/F90/block_cyclic.exe90 ------ pass *** TESTING F90 examples/F90/column_wise.exe90 ------ pass *** TESTING F90 examples/F90/fill_mode.exe90 ------ pass @@ -364,7 +344,6 @@ distribution. All test programs are designed to run on 4 MPI processes. 
*** TESTING F90 examples/F90/put_varn_int.exe90 ------ pass *** TESTING F90 examples/F90/put_varn_real.exe90 ------ pass *** TESTING F90 examples/F90/transpose.exe90 ------ pass - *** TESTING F90 examples/F90/vard_int.exe90 ------ pass Total number of tested programs: 105 diff --git a/test/test_installed/makefile b/test/test_installed/makefile index b643b4925f..87b9a9a185 100644 --- a/test/test_installed/makefile +++ b/test/test_installed/makefile @@ -61,9 +61,6 @@ testcases_src = ../testcases/add_var.c \ ../testcases/test_erange.c \ ../testcases/test_fillvalue.c \ ../testcases/test_get_varn.c \ - ../testcases/test_vard.c \ - ../testcases/test_vard_multiple.c \ - ../testcases/test_vard_rec.c \ ../testcases/test_varm.c \ ../testcases/tst_def_var_fill.c \ ../testcases/tst_del_attr.c \ @@ -109,10 +106,7 @@ examples_C_src = ../../examples/C/block_cyclic.c \ ../../examples/C/put_varn_int.c \ ../../examples/C/time_var.c \ ../../examples/C/transpose2D.c \ - ../../examples/C/transpose.c \ - ../../examples/C/vard_bottom.c \ - ../../examples/C/vard_int.c \ - ../../examples/C/vard_mvars.c + ../../examples/C/transpose.c EXAMPLE_PROGS += $(examples_C_src:../../examples/C/%.c=%) %.o: ../../examples/C/%.c $(CC) $(CFLAGS) -c $< @@ -146,8 +140,7 @@ examples_F77_src = ../../examples/F77/block_cyclic.f \ ../../examples/F77/put_varn_int.f \ ../../examples/F77/put_varn_real.f \ ../../examples/F77/time_var.f \ - ../../examples/F77/transpose.f \ - ../../examples/F77/vard_int.f + ../../examples/F77/transpose.f EXAMPLE_PROGS += $(examples_F77_src:../../examples/F77/%.f=%.exe77) %.77o: ../../examples/F77/%.f @@ -163,8 +156,7 @@ examples_F90_src = ../../examples/F90/block_cyclic.f90 \ ../../examples/F90/put_var.f90 \ ../../examples/F90/put_varn_int.f90 \ ../../examples/F90/put_varn_real.f90 \ - ../../examples/F90/transpose.f90 \ - ../../examples/F90/vard_int.f90 + ../../examples/F90/transpose.f90 EXAMPLE_PROGS += $(examples_F90_src:../../examples/F90/%.f90=%.exe90) all: env_check testutils.o 
utils.o $(TEST_PROGS) $(EXAMPLE_PROGS) batch.sh interactive.sh diff --git a/test/testcases/Makefile.am b/test/testcases/Makefile.am index 5c8a8a25c0..b0266dc8d2 100644 --- a/test/testcases/Makefile.am +++ b/test/testcases/Makefile.am @@ -54,51 +54,61 @@ if NAGFORT AM_FCFLAGS += -w=uparam endif -TESTPROGRAMS = file_create_open \ - ncmpi_vars_null_stride \ - vectors \ - collective_error \ - test_varm \ - alignment_test \ - flexible \ - flexible2 \ - flexible_varm \ - nonblocking \ - noclobber \ - record \ - inq_num_vars \ - varn_int \ - modes \ - one_record \ - inq_recsize \ - test_vard \ - test_vard_rec \ - test_vard_multiple \ - varn_contig \ - ivarn \ - check_striping \ - add_var \ - buftype_free \ - last_large_var \ - check_type \ - test_erange \ - scalar \ - null_args \ - tst_dimsizes \ - mix_collectives \ - large_var_cdf5 \ - tst_max_var_dims \ - tst_info \ - tst_vars_fill \ - tst_def_var_fill \ - test_fillvalue \ - error_precedence \ - tst_free_comm \ - flexible_var \ - test_get_varn \ - tst_del_attr \ - tst_redefine \ - tst_grow_header +check_PROGRAMS = file_create_open \ + ncmpi_vars_null_stride \ + vectors \ + collective_error \ + test_varm \ + alignment_test \ + flexible \ + flexible2 \ + flexible_varm \ + nonblocking \ + noclobber \ + record \ + inq_num_vars \ + varn_int \ + modes \ + one_record \ + inq_recsize \ + test_vard \ + test_vard_rec \ + test_vard_multiple \ + varn_contig \ + ivarn \ + check_striping \ + add_var \ + buftype_free \ + last_large_var \ + check_type \ + test_erange \ + scalar \ + null_args \ + tst_dimsizes \ + mix_collectives \ + large_var_cdf5 \ + tst_max_var_dims \ + tst_info \ + tst_vars_fill \ + tst_def_var_fill \ + test_fillvalue \ + error_precedence \ + tst_free_comm \ + flexible_var \ + test_get_varn \ + tst_del_attr \ + tst_redefine \ + tst_grow_header \ + tst_varn_var1 \ + tst_multi_redefine \ + tst_grow_data \ + tst_inq_header_size \ + tst_data_move \ + put_all_kinds \ + redef1 \ + iput_all_kinds \ + tst_version \ + 
tst_chunk_nonblocking M4_SRCS = put_all_kinds.m4 \ erange_fill.m4 \ @@ -114,17 +124,21 @@ nodist_iput_all_kinds_SOURCES = iput_all_kinds.c nodist_error_precedence_SOURCES = error_precedence.c nodist_null_args_SOURCES = null_args.c +if ENABLE_THREAD_SAFE + check_PROGRAMS += tst_pthread +endif + if TEST_SYMLINK - TESTPROGRAMS += tst_symlink + check_PROGRAMS += tst_symlink endif if TEST_LARGE_COUNT - TESTPROGRAMS += flexible_large_count + check_PROGRAMS += flexible_large_count endif if ENABLE_ERANGE_FILL M4FLAGS += -DERANGE_FILL - TESTPROGRAMS += erange_fill + check_PROGRAMS += erange_fill endif M4FLAGS += -I${top_srcdir}/m4 @@ -138,7 +152,7 @@ $(M4_SRCS:.m4=.c): Makefile set -e; cd ../common && $(MAKE) $(MFLAGS) tests if HAS_FORTRAN - TESTPROGRAMS += varn_intf \ + check_PROGRAMS += varn_intf \ attrf \ buftype_freef \ put_parameter \ @@ -152,10 +166,10 @@ if HAS_FORTRAN test_vardf_SOURCES = test_vardf.F flexible_api_SOURCES = flexible_api.f if HAVE_MPI_MOD - TESTPROGRAMS += inq_num_varsf \ - inq_recsizef \ - test_vardf90 \ - varn_real + check_PROGRAMS += inq_num_varsf \ + inq_recsizef \ + test_vardf90 \ + varn_real inq_num_varsf_SOURCES = inq_num_varsf.f90 inq_recsizef_SOURCES = inq_recsizef.f90 @@ -164,18 +178,9 @@ if HAVE_MPI_MOD endif endif -# all programs in TESTPROGRAMS will be run by wrap_runs.sh -# others in check_PROGRAMS but not in TESTPROGRAMS will be run by seq_runs.sh -# Those are the ones need special treatment -check_PROGRAMS = $(TESTPROGRAMS) \ - put_all_kinds \ - redef1 \ - iput_all_kinds \ - tst_version - # autimake 1.11.3 has not yet implemented AM_TESTS_ENVIRONMENT # For newer versions, we can use AM_TESTS_ENVIRONMENT instead -# AM_TESTS_ENVIRONMENT = TESTPROGRAMS="$(TESTPROGRAMS)" ; export TESTPROGRAMS; +# AM_TESTS_ENVIRONMENT = check_PROGRAMS="$(check_PROGRAMS)" ; export check_PROGRAMS; # AM_TESTS_ENVIRONMENT += TESTSEQRUN="$(TESTSEQRUN)" ; export TESTSEQRUN; # AM_TESTS_ENVIRONMENT += TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)" ; export TESTOUTDIR; 
TESTS_ENVIRONMENT = export SED="$(SED)"; @@ -183,44 +188,31 @@ TESTS_ENVIRONMENT += export srcdir="$(srcdir)"; TESTS_ENVIRONMENT += export TESTOUTDIR="$(FSTYPE_PREFIX)$(TESTOUTDIR)"; TESTS_ENVIRONMENT += export TESTSEQRUN="$(TESTSEQRUN)"; TESTS_ENVIRONMENT += export TESTMPIRUN="$(TESTMPIRUN)"; -TESTS_ENVIRONMENT += export PNETCDF_DEBUG="$(PNETCDF_DEBUG)"; -TESTS_ENVIRONMENT += export TESTPROGRAMS="$(TESTPROGRAMS)"; TESTS_ENVIRONMENT += export check_PROGRAMS="$(check_PROGRAMS)"; -TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER="$(ENABLE_BURST_BUFFER)"; -TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE="$(ENABLE_THREAD_SAFE)"; - -TESTS = $(TESTPROGRAMS) seq_runs.sh -TEST_EXTENSIONS = .sh -LOG_COMPILER = $(srcdir)/wrap_runs.sh -SH_LOG_COMPILER = - -NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc1) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc2) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc3) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc4) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc5) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc1) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc2) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc3) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc4) \ - $(check_PROGRAMS:%=$(TESTOUTDIR)/%.bb.nc5) +if PNETCDF_DEBUG + TESTS_ENVIRONMENT += export PNETCDF_DEBUG=1; +endif +if ENABLE_BURST_BUFFER + TESTS_ENVIRONMENT += export ENABLE_BURST_BUFFER=1; +endif if ENABLE_THREAD_SAFE - check_PROGRAMS += tst_pthread + TESTS_ENVIRONMENT += export ENABLE_THREAD_SAFE=1; +endif +if MIMIC_LUSTRE + TESTS_ENVIRONMENT += export MIMIC_LUSTRE="$(MIMIC_LUSTRE)"; endif -BURST_BUFFER_FILES = $(NC_FILES:=_*.meta) $(NC_FILES:=_*.data) +TESTS = $(check_PROGRAMS) +TEST_EXTENSIONS = .sh +LOG_COMPILER = $(srcdir)/seq_runs.sh +SH_LOG_COMPILER = -CLEANFILES = $(M4_SRCS:.m4=.c) core core.* *.gcda *.gcno *.gcov gmon.out \ - $(TESTOUTDIR)/redef1.nc $(TESTOUTDIR)/redef1.bb.nc \ - $(TESTOUTDIR)/redef2.nc \ - 
$(TESTOUTDIR)/tst_pthread.nc.* $(TESTOUTDIR)/testfile.nc* \ - $(NC_FILES) +CLEANFILES = $(M4_SRCS:.m4=.c) \ + $(TESTOUTDIR)/*.nc $(TESTOUTDIR)/*bb \ + core core.* *.gcda *.gcno *.gcov gmon.out -EXTRA_DIST = $(M4_SRCS) seq_runs.sh redef-good.ncdump \ - wrap_runs.sh parallel_run.sh +EXTRA_DIST = $(M4_SRCS) seq_runs.sh parallel_run.sh # Some of these tests are designed to run on one process, # Run them on 4 processes to see if they can handle well diff --git a/test/testcases/add_var.c b/test/testcases/add_var.c index 002942014f..8fb899c4e0 100644 --- a/test/testcases/add_var.c +++ b/test/testcases/add_var.c @@ -29,26 +29,23 @@ #include -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char fname[256], var_name[256]; + char var_name[256]; int i, nvars, err, nerrs=0; int ncid, varid, dimid[2]; MPI_Offset prev_off; - if (cmode == 0) sprintf(fname,"%s",filename); - else if (cmode & NC_64BIT_OFFSET) sprintf(fname,"%s%d",filename,2); - else if (cmode & NC_64BIT_DATA) sprintf(fname,"%s%d",filename,5); - else if (cmode & NC_NETCDF4) { - if (cmode & NC_CLASSIC_MODEL) - sprintf(fname,"%s%d",filename,4); - else - sprintf(fname,"%s%d",filename,3); - } + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, fname, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimensions */ err = ncmpi_def_dim(ncid, "dim_1", 5, &dimid[0]); CHECK_ERR @@ -58,6 +55,7 @@ tst_fmt(char *filename, int cmode) for (i=0; i<10; i++) { sprintf(var_name, "var_%d", i); err = ncmpi_def_var(ncid, var_name, NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR } err = ncmpi_enddef(ncid); CHECK_ERR @@ -70,11 +68,13 @@ tst_fmt(char *filename, int cmode) /* add 2 new variables */ err = ncmpi_def_var(ncid, "new_var1", NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR err = ncmpi_def_var(ncid, "new_var2", NC_FLOAT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR err = ncmpi_inq_nvars(ncid, &nvars); CHECK_ERR - if (cmode & NC_NETCDF4) { + if (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_inq_varoffset(ncid, 0, &prev_off); EXP_ERR(NC_ENOTSUPPORT) } else { @@ -96,63 +96,27 @@ tst_fmt(char *filename, int cmode) return nerrs; } -int main(int argc, char** argv) { - char filename[256], *hint_value; - int rank, err, nerrs=0, bb_enabled=0; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); +int main(int argc, char **argv) { - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for checking offsets of new variables ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); 
fflush(stdout); - free(cmd_str); - } + int err; + loop_opts opt; - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + MPI_Init(&argc, &argv); - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "checking offsets of new variables", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/alignment_test.c b/test/testcases/alignment_test.c index c04a2377cc..ec8a25a078 100644 --- a/test/testcases/alignment_test.c +++ b/test/testcases/alignment_test.c @@ -31,13 +31,20 @@ #include 
#define NVARS 8 -#define NX 5 +#define NX 70 -static int tst_mode(char *filename, - int mode) +#define TEST_FIXED_VAR +#define TEST_RECORD_VAR + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info global_info) { int i, j, rank, nprocs, err, verbose=0, nerrs=0; - int ncid, cmode, varid[NVARS], dimid[2], *buf; + int ncid, varid[NVARS], dimid[2], *buf; char str[32]; MPI_Offset start[2], count[2]; MPI_Offset new_var_off[NVARS*2], old_var_off[NVARS*2]; @@ -47,17 +54,20 @@ static int tst_mode(char *filename, MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Info_dup(global_info, &info); + + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERROUT /* define dimension */ err = ncmpi_def_dim(ncid, "Y", NC_UNLIMITED, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", NX*nprocs, &dimid[1]); CHECK_ERR -#define TEST_FIXED_VAR -#define TEST_RECORD_VAR /* Odd numbers are fixed variables, even numbers are record variables */ for (i=0; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for alignment ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + loop_opts opt; - nerrs += tst_mode(filename, MODE_COLL); - if (nerrs > 0) goto err_out; - - nerrs += tst_mode(filename, MODE_INDEP); - if (nerrs > 0) goto err_out; - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; 
- err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + MPI_Init(&argc, &argv); -err_out: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "alignment hints", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/attrf.f b/test/testcases/attrf.f index a3e4fce108..ac572ec6ee 100644 --- a/test/testcases/attrf.f +++ b/test/testcases/attrf.f @@ -34,8 +34,8 @@ subroutine check(err, message, nerrs) ! It is a good idea to check returned value for possible error if (err .NE. NF_NOERR) then write(6,*) message(1:XTRIM(message)), nfmpi_strerror(err) - msg = '*** TESTING F77 attrf.f for attribute overflow ' - call pass_fail(1, msg) + msg = '*** TESTING F77 attrf.f - attribute overflow ' + call pass_fail(1, msg, 0.0D0) nerrs = nerrs + 1 end if end !
subroutine check @@ -51,32 +51,40 @@ program main integer*2 buf_int2 integer*8 buf_int8, one - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer ncid, err, ierr, nerrs, nprocs, rank, get_args integer*8 malloc_size, sum_size + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) one = 1 one_flt = 1.0 - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, + MPI_COMM_WORLD, ierr) + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, + + ierr) + nerrs = 0 cmode = IOR(NF_CLOBBER,NF_64BIT_DATA) - err = nfmpi_create(MPI_COMM_WORLD, filename, cmode, + err = nfmpi_create(MPI_COMM_WORLD, out_path, cmode, + MPI_INFO_NULL, ncid) call check(err, 'In nfmpi_create: ', nerrs) @@ -184,10 +192,18 @@ program main + sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))// - + ' for attribute overflow ' - call pass_fail(nerrs, msg) + + ' - attribute overflow ' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/buftype_free.c b/test/testcases/buftype_free.c index 90ffc7645c..714e3583b6 100644 --- a/test/testcases/buftype_free.c +++ b/test/testcases/buftype_free.c @@ -1,8 +1,6 @@ /* * Copyright (C) 2015, Northwestern University and Argonne National Laboratory * See COPYRIGHT notice in top-level directory. - * - * $Id$ */ /* @@ -21,39 +19,32 @@ #include #define NY 4 -#define NX 4 - -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) { - - char filename[256]; - int i, j, err, ncid, varid[4], dimids[2], req[4], st[4], nerrs=0; - int rank, nprocs, buf[4][(NY+4)*(NX+4)]; - int gsize[2], subsize[2], a_start[2], ghost; +#define NX 100 +#define NVARS 4 +#define NGHOSTS 2 + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int i, j, k, err, ncid, nerrs=0, rank, nprocs; + int varid[NVARS], dimids[2], req[NVARS], st[NVARS], *buf[NVARS]; + int gsize[2], subsize[2], a_start[2], ghost; MPI_Offset start[2], count[2]; - MPI_Datatype buftype[4]; + MPI_Datatype buftype[NVARS]; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for free buftype in flexible API ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); 
fflush(stdout); - free(cmd_str); - } + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define a 2D array */ err = ncmpi_def_dim(ncid, "Y", NY*nprocs, &dimids[0]); CHECK_ERR @@ -64,49 +55,91 @@ int main(int argc, char **argv) { err = ncmpi_def_var(ncid, "var3", NC_INT, 2, dimids, &varid[3]); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR - /* initialize the contents of the array */ - for (i=0; i<4; i++) for (j=0; j<(NY+4)*(NX+4); j++) buf[i][j] = rank+10; + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* initialize the contents of the array */ start[0] = NY*rank; start[1] = 0; count[0] = NY; count[1] = NX; - err = ncmpi_put_vara_int_all(ncid, varid[0], start, count, buf[0]); CHECK_ERR - err = ncmpi_put_vara_int_all(ncid, varid[1], start, count, buf[1]); CHECK_ERR - err = ncmpi_put_vara_int_all(ncid, varid[2], start, count, buf[2]); CHECK_ERR - err = ncmpi_put_vara_int_all(ncid, varid[3], start, count, buf[3]); CHECK_ERR + for (i=0; i 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); + /* check contents of read buffers */ + for (i=0; i= gsize[0]-ghost || + k < ghost || k >= gsize[1]-ghost) + exp = -1; + else + exp = (int)((j-ghost)*count[1]+(k-ghost) + rank*10); + if (buf[i][j*gsize[1]+k] != exp) { + printf("Error at %d: var %d expect buf[%d][%d] = %d but got %d\n", + __LINE__, i, j, k, exp, buf[i][j*gsize[1]+k]); + nerrs++; + goto err_out; + } + } + } } - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + for (i=0; i 0); +err_out: + return nerrs; } +int main(int argc, char **argv) { + + int err; + int formats[] = 
{NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "free buftype in flexible API", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/buftype_freef.f b/test/testcases/buftype_freef.f index 18e06cc679..7fba21ee9a 100644 --- a/test/testcases/buftype_freef.f +++ b/test/testcases/buftype_freef.f @@ -35,8 +35,8 @@ subroutine check(err, message) ! It is a good idea to check returned value for possible error if (err .NE. NF_NOERR) then write(6,*) message(1:XTRIM(message)), nfmpi_strerror(err) - msg = '*** TESTING F77 buftype_freef.f for flexible API ' - call pass_fail(1, msg) + msg = '*** TESTING F77 buftype_freef.f - flexible API ' + call pass_fail(1, msg, 0.0D0) STOP 2 end if end ! subroutine check @@ -50,29 +50,37 @@ program main integer*8 NX, NY PARAMETER(NREQS=4, NX=4, NY=4) - character(LEN=256) filename, cmd, msg, varname, str + character(LEN=256) out_path, in_path, cmd, msg, varname, str integer i, j, err, ierr, nprocs, rank, nerrs, get_args integer ncid, ghost integer buf(64,4), varid(4), dimid(2), req(4), st(4) integer buftype(4), gsize(2), subsize(2), a_start(2) integer*8 start(2), count(2) integer*8 one, malloc_size, sum_size + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - !
take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, + MPI_COMM_WORLD, ierr) + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, + + ierr) + nerrs = 0 ! initialize I/O buffer @@ -83,7 +91,7 @@ program main enddo ! create file, truncate it if exists - err = nfmpi_create(MPI_COMM_WORLD, filename, NF_CLOBBER, + err = nfmpi_create(MPI_COMM_WORLD, out_path, NF_CLOBBER, + MPI_INFO_NULL, ncid) call check(err, 'In nfmpi_create:') @@ -167,10 +175,18 @@ program main + sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))// - + ' for flexible API ' - call pass_fail(nerrs, msg) + + ' - flexible API ' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/check_striping.c b/test/testcases/check_striping.c index ff36e0215a..18701a7d13 100644 --- a/test/testcases/check_striping.c +++ b/test/testcases/check_striping.c @@ -29,19 +29,26 @@ #include -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { int err, nerrs=0, ncid, fmt; int striping_size=0, striping_count=0, root_striping_size, root_striping_count; - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR err = ncmpi_inq_format(ncid, &fmt); CHECK_ERR err = ncmpi_inq_striping(ncid, &striping_size, &striping_count); - if (fmt == NC_FORMAT_NETCDF4 || fmt == NC_FORMAT_NETCDF4_CLASSIC) + if (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC) EXP_ERR(NC_ENOTSUPPORT) else CHECK_ERR @@ -68,61 +75,27 @@ tst_fmt(char *filename, int cmode) return nerrs; } -int main(int argc, char** argv) { - char filename[256], *hint_value; - int rank, err, nerrs=0, bb_enabled=0; +int main(int argc, char **argv) { + + int err; + loop_opts opt; MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING 
C %s for striping info ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + err = tst_main(argc, argv, "inquire striping info", opt, test_io); - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/check_type.c b/test/testcases/check_type.c index 36a559b400..b98be7a28e 100644 --- a/test/testcases/check_type.c +++ b/test/testcases/check_type.c @@ -53,19 +53,26 @@ static 
char* etype_name(nc_type etype) { } } -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { char *varname[12], buf[1024], attname[256]; int i, err, nerrs=0, ncid, dimid, varid[12], max_type; - if (cmode == 0 || cmode == NC_64BIT_OFFSET || cmode & NC_CLASSIC_MODEL) - max_type = NC_DOUBLE; - else + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + if (format == NC_FORMAT_64BIT_DATA) max_type = NC_UINT64; + else + max_type = NC_DOUBLE; - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "x", 2, &dimid); CHECK_ERR varname[0] = "var_nat"; @@ -173,65 +180,27 @@ tst_fmt(char *filename, int cmode) return nerrs; } -int main(int argc, char* argv[]) -{ - char filename[256], *hint_value; - int err, nerrs=0, rank, bb_enabled=0; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); +int main(int argc, char **argv) { - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for checking for type conflict ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + int err; + loop_opts opt; - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + MPI_Init(&argc, &argv); - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if 
(!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "checking for type conflict", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + return err; } - diff --git a/test/testcases/collective_error.c b/test/testcases/collective_error.c index 6a22088a01..465cfa581c 100644 --- a/test/testcases/collective_error.c +++ b/test/testcases/collective_error.c @@ -32,9 +32,15 @@ #include static -int test_collective_error(char *filename, int safe_mode, int cmode) +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - int rank, nproc, ncid, err, nerrs=0, varid, dimids[1], req, status; + char *val; + int rank, nproc, ncid, err, nerrs=0, varid, dimids[1], req, status, exp; 
+ int safe_mode=0; double buf[2]; MPI_Offset start[1], count[1]; MPI_Comm comm=MPI_COMM_WORLD; @@ -42,22 +48,27 @@ int test_collective_error(char *filename, int safe_mode, int cmode) MPI_Comm_rank(comm, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); + val = getenv("PNETCDF_SAFE_MODE"); + if (val != NULL && *val == '1') safe_mode = 1; /* getenv() returns text: compare with the character '1', not the integer 1 */ + + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* Create a 2 element vector of doubles */ - cmode |= NC_CLOBBER; - err = ncmpi_create(comm, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR - err = ncmpi_def_dim(ncid, "dim", 2, &dimids[0]); CHECK_ERR - err = ncmpi_def_var(ncid, "var", NC_DOUBLE, 1, dimids, &varid); CHECK_ERR - err = ncmpi_enddef(ncid); CHECK_ERR + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR_ALL + err = ncmpi_def_dim(ncid, "dim", 2, &dimids[0]); CHECK_ERR_ALL + err = ncmpi_def_var(ncid, "var", NC_DOUBLE, 1, dimids, &varid); CHECK_ERR_ALL + err = ncmpi_enddef(ncid); CHECK_ERR_ALL if (rank == 0) { start[0] = 0; count[0] = 2; } else if (rank == 1) { -#if defined(PNETCDF_RELAX_COORD_BOUND) && PNETCDF_RELAX_COORD_BOUND==1 - start[0] = 3; /* illegal for a start > defined shape */ -#else - start[0] = 2; /* illegal for a start >= defined shape */ -#endif + if (is_relax_coord_bound()) + start[0] = 3; /* illegal for a start > defined shape */ + else + start[0] = 2; /* illegal for a start >= defined shape */ count[0] = 0; } else { @@ -70,8 +81,9 @@ int test_collective_error(char *filename, int safe_mode, int cmode) err = ncmpi_put_vara_all(ncid, varid, start, count, buf, count[0], MPI_DOUBLE); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) /* check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -86,8 +98,9 @@ int test_collective_error(char *filename, int safe_mode, int cmode) } err =
ncmpi_put_vara_double_all(ncid, varid, start, count, buf); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) /* check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -101,14 +114,12 @@ int test_collective_error(char *filename, int safe_mode, int cmode) nerrs++; } - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_iput_vara_double(ncid, varid, start, count, buf, &req); - if (rank == 1) - EXP_ERR(NC_EINVALCOORDS) - else - EXP_ERR(NC_NOERR) + exp = (rank == 1) ? NC_EINVALCOORDS : NC_NOERR; + EXP_ERR(exp) - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR_ALL /* check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -123,58 +134,39 @@ int test_collective_error(char *filename, int safe_mode, int cmode) } } + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_get_vara_all(ncid, varid, start, count, buf, count[0], MPI_DOUBLE); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) err = ncmpi_get_vara_double_all(ncid, varid, start, count, buf); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_iget_vara_double(ncid, varid, start, count, buf, &req); - if (rank == 1) - EXP_ERR(NC_EINVALCOORDS) - else - EXP_ERR(NC_NOERR) + exp = (rank == 1) ? 
NC_EINVALCOORDS : NC_NOERR; + EXP_ERR(exp) - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR_ALL } - err = ncmpi_close(ncid); CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR_ALL +err_out: return nerrs; } -int main(int argc, char *argv[]) +#if 0 { - char filename[256], *hint_value; - int rank, err, nerrs=0, bb_enabled=0; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for collective abort ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + int err, nerrs=0; /* test in non-safe mode */ setenv("PNETCDF_SAFE_MODE", "0", 1); @@ -210,24 +202,31 @@ int main(int argc, char *argv[]) nerrs += test_collective_error(filename, 1, NC_64BIT_DATA); if (nerrs) goto err_out; - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return (nerrs > 0); +} +#endif -err_out: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + loop_opts 
opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "collective abort", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/erange_fill.m4 b/test/testcases/erange_fill.m4 index b9a1f6321c..50e706215a 100644 --- a/test/testcases/erange_fill.m4 +++ b/test/testcases/erange_fill.m4 @@ -39,6 +39,8 @@ dnl */ #define LEN 12 +static int bb_enabled; + include(`foreach.m4')dnl include(`utils.m4')dnl @@ -77,121 +79,137 @@ define(`ITYPE_SIZE',`ifelse( `$1', `longlong', `8',dnl `$1', `ulonglong', `8')')dnl +define(`CHECK_DEFAULT_FILL_VALUE',` + err = ncmpi_inq_varid(ncid, "var_$1", &varid); CHECK_ERR + if (coll_io) + err = GET_VAR($1,_all)(ncid, varid, buf_$1); + else + err = GET_VAR($1)(ncid, varid, buf_$1); + CHECK_ERR + for (i=0; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for erange elements are filled ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + char hint[MPI_MAX_INFO_VAL]; + int err, nerrs=0, flag; - /*---- CDF-2 format 
-----------------------------------------------------*/ - /* ncmpi_set_default_format(NC_FORMAT_CLASSIC, NULL); */ - ncmpi_set_default_format(NC_FORMAT_CDF2, NULL); + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL-1, hint, &flag); + if (flag && strcasecmp(hint, "enable") == 0) + bb_enabled = 1; + else + bb_enabled = 0; - nerrs += test_default_fill_mode(filename); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - foreach(`itype', (CDF2_ITYPE_LIST), ` - _CAT(`nerrs += test_default_fill_',itype)'`(filename);') + nerrs += test_default_fill_mode(out_path, format, coll_io, info); - fillv=99; - foreach(`itype', (CDF2_ITYPE_LIST), ` - _CAT(`nerrs += test_user_fill_',itype)'`(filename, (itype)fillv);') + nerrs += test_user_fill_mode(out_path, format, coll_io, info); /* test put ERANGE values */ foreach(`itype', (uchar,short,int,float,double), ` - _CAT(`nerrs += test_erange_put_schar_',itype)'`(filename);') + _CAT(`nerrs += test_erange_put_schar_',itype)'`(out_path, coll_io, info);') foreach(`itype', (ushort,int,uint,float,double), ` - _CAT(`nerrs += test_erange_put_short_',itype)'`(filename);') + _CAT(`nerrs += test_erange_put_short_',itype)'`(out_path, coll_io, info);') foreach(`itype', (float,double), ` - _CAT(`nerrs += test_erange_put_int_',itype)'`(filename);') + _CAT(`nerrs += test_erange_put_int_',itype)'`(out_path, coll_io, info);') - nerrs += test_erange_put_float_double(filename); + nerrs += test_erange_put_float_double(out_path, coll_io, info); /* test get ERANGE values */ foreach(`itype', (short,int,float,double), ` - _CAT(`nerrs += test_erange_get_',itype)'`_schar(filename);') + _CAT(`nerrs += test_erange_get_',itype)'`_schar(out_path, coll_io, info);') foreach(`itype', (schar,short,int,float,double), ` - _CAT(`nerrs += test_erange_get_',itype)'`_uchar(filename);') + _CAT(`nerrs += test_erange_get_',itype)'`_uchar(out_path, coll_io, info);') foreach(`itype', (int,float,double), ` - _CAT(`nerrs += 
test_erange_get_',itype)'`_short(filename);') + _CAT(`nerrs += test_erange_get_',itype)'`_short(out_path, coll_io, info);') foreach(`itype', (float,double), ` - _CAT(`nerrs += test_erange_get_',itype)'`_int(filename);') + _CAT(`nerrs += test_erange_get_',itype)'`_int(out_path, coll_io, info);') - nerrs += test_erange_get_double_float(filename); + nerrs += test_erange_get_double_float(out_path, coll_io, info); /*---- CDF-5 format -----------------------------------------------------*/ - ncmpi_set_default_format(NC_FORMAT_CDF5, NULL); + if (format == NC_FORMAT_CDF5) { - nerrs += test_default_fill_mode(filename); + /* test put ERANGE values */ + foreach(`itype', (ushort,uint,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_put_schar_',itype)'`(out_path, coll_io, info);') - foreach(`itype', (ITYPE_LIST), ` - _CAT(`nerrs += test_default_fill_',itype)'`(filename);') + foreach(`itype', (schar,short,ushort,int,uint,float,double,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_put_uchar_',itype)'`(out_path, coll_io, info);') - fillv=99; - foreach(`itype', (ITYPE_LIST), ` - _CAT(`nerrs += test_user_fill_',itype)'`(filename, (itype)fillv);') + foreach(`itype', (longlong,ulonglong), ` + _CAT(`nerrs += test_erange_put_short_',itype)'`(out_path, coll_io, info);') - /* test put ERANGE values */ - foreach(`itype', (uchar,short,ushort,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_put_schar_',itype)'`(filename);') + foreach(`itype', (short,int,uint,float,double,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_put_ushort_',itype)'`(out_path, coll_io, info);') - foreach(`itype', (schar,short,ushort,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_put_uchar_',itype)'`(filename);') + foreach(`itype', (uint,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_put_int_',itype)'`(out_path, coll_io, info);') - foreach(`itype', (ushort,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += 
test_erange_put_short_',itype)'`(filename);') + foreach(`itype', (int,float,double,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_put_uint_',itype)'`(out_path, coll_io, info);') - foreach(`itype', (short,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_put_ushort_',itype)'`(filename);') + nerrs += test_erange_put_float_double(out_path, coll_io, info); - foreach(`itype', (uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_put_int_',itype)'`(filename);') + /* test get ERANGE values */ + foreach(`itype', (uchar,ushort,uint,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_get_',itype)'`_schar(out_path, coll_io, info);') - foreach(`itype', (int,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_put_uint_',itype)'`(filename);') + foreach(`itype', (ushort,uint,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_get_',itype)'`_uchar(out_path, coll_io, info);') - nerrs += test_erange_put_float_double(filename); + foreach(`itype', (ushort,uint,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_get_',itype)'`_short(out_path, coll_io, info);') - /* test get ERANGE values */ - foreach(`itype', (uchar,short,ushort,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_get_',itype)'`_schar(filename);') + foreach(`itype', (short,int,uint,float,double,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_get_',itype)'`_ushort(out_path, coll_io, info);') - foreach(`itype', (schar,short,ushort,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_get_',itype)'`_uchar(filename);') + foreach(`itype', (uint,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_get_',itype)'`_int(out_path, coll_io, info);') - foreach(`itype', (ushort,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_get_',itype)'`_short(filename);') + foreach(`itype', (int,float,double,longlong,ulonglong), ` + _CAT(`nerrs += test_erange_get_',itype)'`_uint(out_path, coll_io, info);') + } - 
foreach(`itype', (short,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_get_',itype)'`_ushort(filename);') + return nerrs; +} - foreach(`itype', (uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_get_',itype)'`_int(filename);') +int main(int argc, char **argv) { - foreach(`itype', (int,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_erange_get_',itype)'`_uint(filename);') + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; - nerrs += test_erange_get_double_float(filename); + MPI_Init(&argc, &argv); - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "erange elements are filled", opt, test_io); MPI_Finalize(); - return (nerrs == 0) ? 
0 : 1; -} + return err; +} diff --git a/test/testcases/error_precedence.m4 b/test/testcases/error_precedence.m4 index 6b26b0c387..681b662748 100644 --- a/test/testcases/error_precedence.m4 +++ b/test/testcases/error_precedence.m4 @@ -77,8 +77,8 @@ dnl #define EndDef_ ncmpi__enddef #define FileClose ncmpi_close #define StrError ncmpi_strerror -#define FileCreate(a,b,c) ncmpi_create(MPI_COMM_WORLD,a,b,MPI_INFO_NULL,c) -#define FileOpen(a,b,c) ncmpi_open(MPI_COMM_WORLD,a,b,MPI_INFO_NULL,c) +#define FileCreate(a,b,c) ncmpi_create(MPI_COMM_WORLD,a,b,info,c) +#define FileOpen(a,b,c) ncmpi_open(MPI_COMM_WORLD,a,b,info,c) #define API(kind) ncmpi_##kind #define API_ALL(kind) ncmpi_##kind##_all #endif @@ -131,9 +131,9 @@ define(`EXTRA_ITYPES',`uchar,ushort,uint,longlong,ulonglong')dnl define(`TEST_FORMAT',dnl `dnl static int -test_format_nc$1(char *filename) +test_format_nc$1(const char *out_path, int coll_io, MPI_Info info) { - int err, nerrs=0, ncid, cmode, dimids[2]; + int err, nerrs=0, ncid, dimids[2]; MPI_Offset start[2], count[2]; #ifdef TEST_NETCDF ptrdiff_t stride[2]; @@ -160,16 +160,10 @@ test_format_nc$1(char *filename) dnl #define NC_FORMAT_64BIT_DATA (5) /* create a new file */ - ifelse(`$1',`2',`cmode = NC_CLOBBER | NC_64BIT_OFFSET;', - `$1',`5',`cmode = NC_CLOBBER | NC_64BIT_DATA;', - `$1',`3',`cmode = NC_CLOBBER | NC_NETCDF4;', - `$1',`4',`cmode = NC_CLOBBER | NC_NETCDF4 | NC_CLASSIC_MODEL;', - `cmode = NC_CLOBBER;')dnl - - err=FileCreate(filename, cmode, &ncid); + err=FileCreate(out_path, NC_CLOBBER, &ncid); if (err != NC_NOERR) { printf("Error at line %d in %s: FileCreate() file %s (%s)\n", - __LINE__,__FILE__,filename,StrError(err)); + __LINE__,__FILE__,out_path,StrError(err)); MPI_Abort(MPI_COMM_WORLD, -1); exit(1); } @@ -196,6 +190,9 @@ test_format_nc$1(char *filename) foreach(`itype',(text, TYPE_LIST),`_CAT(` err=API(def_var)(ncid,"var_'itype`",NC_TYPE(itype),2,dimids,&vid_',itype`); CHECK_ERR')') + foreach(`itype',(text, TYPE_LIST),`_CAT(` + 
err=API(def_var_fill)(ncid, vid_'itype`, 0, NULL); CHECK_ERR')') + /* For put attribute APIs, the error precedence is the following: * NC_EBADID, NC_EPERM, NC_ENOTVAR, NC_EBADNAME, NC_EBADTYPE, NC_ECHAR, * NC_EINVAL, NC_ENOTINDEFINE, NC_ERANGE @@ -267,22 +264,44 @@ test_format_nc$1(char *filename) err=API(del_att)(ncid,vid_'itype`,`"att_'itype`"'); CHECK_ERR')') /* test put_var APIs in define mode */ - ifelse(`$1',`3',`',`/* test NC_EINDEFINE */dnl - foreach(`itype',(text, TYPE_LIST),`_CAT(` - err=API_ALL(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EINDEFINE) - err=API_ALL(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EINDEFINE) - err=API_ALL(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) - err=API_ALL(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) - err=API_ALL(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE)')')') + if (coll_io) { + ifelse(`$1',`3',`',`/* test NC_EINDEFINE */dnl + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API_ALL(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EINDEFINE) + err=API_ALL(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API_ALL(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API_ALL(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API_ALL(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE)')')') + } + else { + ifelse(`$1',`3',`',`/* test NC_EINDEFINE */dnl + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EINDEFINE) + err=API(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE)')')') + } /* test put_var APIs in define mode */ - ifelse(`$1',`3',`',`/* test 
NC_EINDEFINE */dnl - foreach(`itype',(text, TYPE_LIST),`_CAT(` - err=API_ALL(get_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EINDEFINE) - err=API_ALL(get_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EINDEFINE) - err=API_ALL(get_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) - err=API_ALL(get_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) - err=API_ALL(get_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE)')')') + if (coll_io) { + ifelse(`$1',`3',`',`/* test NC_EINDEFINE */dnl + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API_ALL(get_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EINDEFINE) + err=API_ALL(get_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API_ALL(get_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API_ALL(get_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API_ALL(get_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE)')')') + } + else { + ifelse(`$1',`3',`',`/* test NC_EINDEFINE */dnl + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(get_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EINDEFINE) + err=API(get_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API(get_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API(get_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE) + err=API(get_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINDEFINE)')')') + } /* test NC_EBADID */ err=EndDef(-999); EXP_ERR(NC_EBADID) @@ -291,6 +310,11 @@ test_format_nc$1(char *filename) /* leave define mode and enter data mode */ err=EndDef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* attribute att_text has been deleted */ foreach(`itype',(text, TYPE_LIST),`_CAT(` err=API(inq_att)(ncid,vid_'itype`,`"att_'itype`"',NULL,NULL); EXP_ERR(NC_ENOTATT)')') @@ -320,74 +344,146 @@ test_format_nc$1(char *filename) */ /* test 
NC_EBADID */dnl - foreach(`itype',(text, TYPE_LIST),`_CAT(` - err=API_ALL(put_var_'itype`) (-999,-999,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(put_var1_'itype`)(-999,-999,NULL,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(put_vara_'itype`)(-999,-999,NULL,NULL,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(put_vars_'itype`)(-999,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(put_varm_'itype`)(-999,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(get_var_'itype`) (-999,-999,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(get_var1_'itype`)(-999,-999,NULL,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(get_vara_'itype`)(-999,-999,NULL,NULL,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(get_vars_'itype`)(-999,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) - err=API_ALL(get_varm_'itype`)(-999,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) -')') + if (coll_io) { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API_ALL(put_var_'itype`) (-999,-999,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(put_var1_'itype`)(-999,-999,NULL,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(put_vara_'itype`)(-999,-999,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(put_vars_'itype`)(-999,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(put_varm_'itype`)(-999,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(get_var_'itype`) (-999,-999,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(get_var1_'itype`)(-999,-999,NULL,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(get_vara_'itype`)(-999,-999,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(get_vars_'itype`)(-999,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API_ALL(get_varm_'itype`)(-999,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + ')') + } + else { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(put_var_'itype`) (-999,-999,NULL); EXP_ERR(NC_EBADID) + err=API(put_var1_'itype`)(-999,-999,NULL,NULL); EXP_ERR(NC_EBADID) + err=API(put_vara_'itype`)(-999,-999,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + 
err=API(put_vars_'itype`)(-999,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API(put_varm_'itype`)(-999,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API(get_var_'itype`) (-999,-999,NULL); EXP_ERR(NC_EBADID) + err=API(get_var1_'itype`)(-999,-999,NULL,NULL); EXP_ERR(NC_EBADID) + err=API(get_vara_'itype`)(-999,-999,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API(get_vars_'itype`)(-999,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + err=API(get_varm_'itype`)(-999,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EBADID) + ')') + } /* test NC_ENOTVAR */dnl - foreach(`itype',(text, TYPE_LIST),`_CAT(` - err=API_ALL(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(get_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(get_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(get_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(get_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) - err=API_ALL(get_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) -')') + if (coll_io) { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API_ALL(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_ENOTVAR) + err=API_ALL(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API_ALL(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API_ALL(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API_ALL(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API_ALL(get_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_ENOTVAR) + err=API_ALL(get_var1_'itype`)(ncid,-999,NULL,NULL); 
EXP_ERR(NC_ENOTVAR) + err=API_ALL(get_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API_ALL(get_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API_ALL(get_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + ')') + } + else { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_ENOTVAR) + err=API(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API(get_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_ENOTVAR) + err=API(get_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API(get_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API(get_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + err=API(get_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_ENOTVAR) + ')') + } /* test NC_EINVALCOORDS */ start[0] = Y_LEN; start[1] = X_LEN; - foreach(`itype',(text, TYPE_LIST),`_CAT(` - err=API_ALL(put_var1_'itype`)(ncid,vid_'itype`,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(put_vara_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(put_var1_'itype`)(ncid,vid_'itype`,start,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(put_vara_'itype`)(ncid,vid_'itype`,start,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - 
err=API_ALL(get_var1_'itype`)(ncid,vid_'itype`,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(get_vara_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(get_var1_'itype`)(ncid,vid_'itype`,start,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(get_vara_'itype`)(ncid,vid_'itype`,start,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) - err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) -')') + if (coll_io) { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API_ALL(put_var1_'itype`)(ncid,vid_'itype`,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(put_vara_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(put_var1_'itype`)(ncid,vid_'itype`,start,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(put_vara_'itype`)(ncid,vid_'itype`,start,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(get_var1_'itype`)(ncid,vid_'itype`,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(get_vara_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + 
err=API_ALL(get_var1_'itype`)(ncid,vid_'itype`,start,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(get_vara_'itype`)(ncid,vid_'itype`,start,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + ')') + } + else { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(put_var1_'itype`)(ncid,vid_'itype`,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(put_vara_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(put_vars_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(put_varm_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(put_var1_'itype`)(ncid,vid_'itype`,start,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(put_vara_'itype`)(ncid,vid_'itype`,start,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(put_vars_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(put_varm_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(get_var1_'itype`)(ncid,vid_'itype`,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(get_vara_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(get_vars_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(get_varm_'itype`)(ncid,vid_'itype`,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(get_var1_'itype`)(ncid,vid_'itype`,start,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(get_vara_'itype`)(ncid,vid_'itype`,start,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(get_vars_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + err=API(get_varm_'itype`)(ncid,vid_'itype`,start,NULL,NULL,NULL,NULL); EXP_ERR(NC_EINVALCOORDS) + ')') + } /* test NC_EEDGE */ start[0] = 0; start[1] = 0; count[0] = Y_LEN; count[1] = X_LEN + 1; - 
foreach(`itype',(text, TYPE_LIST),`_CAT(` - err=API_ALL(put_vara_'itype`)(ncid,vid_'itype`,start,NULL, NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(put_vara_'itype`)(ncid,vid_'itype`,start,count,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(get_vara_'itype`)(ncid,vid_'itype`,start,NULL, NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(get_vara_'itype`)(ncid,vid_'itype`,start,count,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL); EXP_ERR(NC_EEDGE) - err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL,NULL); EXP_ERR(NC_EEDGE) -')') + if (coll_io) { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API_ALL(put_vara_'itype`)(ncid,vid_'itype`,start,NULL, NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(put_vara_'itype`)(ncid,vid_'itype`,start,count,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(get_vara_'itype`)(ncid,vid_'itype`,start,NULL, NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,start,NULL, 
NULL,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(get_vara_'itype`)(ncid,vid_'itype`,start,count,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL,NULL); EXP_ERR(NC_EEDGE) + ')') + } + else { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(put_vara_'itype`)(ncid,vid_'itype`,start,NULL, NULL); EXP_ERR(NC_EEDGE) + err=API(put_vars_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL); EXP_ERR(NC_EEDGE) + err=API(put_varm_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API(put_vara_'itype`)(ncid,vid_'itype`,start,count,NULL); EXP_ERR(NC_EEDGE) + err=API(put_vars_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API(put_varm_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API(get_vara_'itype`)(ncid,vid_'itype`,start,NULL, NULL); EXP_ERR(NC_EEDGE) + err=API(get_vars_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL); EXP_ERR(NC_EEDGE) + err=API(get_varm_'itype`)(ncid,vid_'itype`,start,NULL, NULL,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API(get_vara_'itype`)(ncid,vid_'itype`,start,count,NULL); EXP_ERR(NC_EEDGE) + err=API(get_vars_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL); EXP_ERR(NC_EEDGE) + err=API(get_varm_'itype`)(ncid,vid_'itype`,start,count,NULL,NULL,NULL); EXP_ERR(NC_EEDGE) + ')') + } /* test NC_ESTRIDE */ start[0] = start[1] = 0; @@ -395,22 +491,38 @@ test_format_nc$1(char *filename) count[1] = X_LEN; stride[0] = -1; stride[1] = -1; - foreach(`itype',(text, TYPE_LIST),`_CAT(` - err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,start,count,stride,NULL); EXP_ERR(NC_ESTRIDE) - err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,start,count,stride,NULL,NULL); EXP_ERR(NC_ESTRIDE) - err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,start,count,stride,NULL); EXP_ERR(NC_ESTRIDE) - err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,start,count,stride,NULL,NULL); 
EXP_ERR(NC_ESTRIDE) -')') + if (coll_io) { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API_ALL(put_vars_'itype`)(ncid,vid_'itype`,start,count,stride,NULL); EXP_ERR(NC_ESTRIDE) + err=API_ALL(put_varm_'itype`)(ncid,vid_'itype`,start,count,stride,NULL,NULL); EXP_ERR(NC_ESTRIDE) + err=API_ALL(get_vars_'itype`)(ncid,vid_'itype`,start,count,stride,NULL); EXP_ERR(NC_ESTRIDE) + err=API_ALL(get_varm_'itype`)(ncid,vid_'itype`,start,count,stride,NULL,NULL); EXP_ERR(NC_ESTRIDE) + ')') + } + else { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(put_vars_'itype`)(ncid,vid_'itype`,start,count,stride,NULL); EXP_ERR(NC_ESTRIDE) + err=API(put_varm_'itype`)(ncid,vid_'itype`,start,count,stride,NULL,NULL); EXP_ERR(NC_ESTRIDE) + err=API(get_vars_'itype`)(ncid,vid_'itype`,start,count,stride,NULL); EXP_ERR(NC_ESTRIDE) + err=API(get_varm_'itype`)(ncid,vid_'itype`,start,count,stride,NULL,NULL); EXP_ERR(NC_ESTRIDE) + ')') + } /* close the file */ err=FileClose(-999); EXP_ERR(NC_EBADID) + + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err=FileClose(ncid); CHECK_ERR /* open the file with read-only permission */ - err=FileOpen(filename, NC_NOWRITE, &ncid); + err=FileOpen(out_path, NC_NOWRITE, &ncid); if (err != NC_NOERR) { printf("Error at line %d in %s: FileOpen() file %s (%s)\n", - __LINE__,__FILE__,filename,StrError(err)); + __LINE__,__FILE__,out_path,StrError(err)); MPI_Abort(MPI_COMM_WORLD, -1); exit(1); } @@ -418,6 +530,11 @@ test_format_nc$1(char *filename) /* test NC_EPERM */ err=ReDef(ncid); EXP_ERR(NC_EPERM) + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* test NC_EPERM for attribute APIs */dnl err=API(put_att_text) (-999,-999,NULL,-999,NULL); EXP_ERR(NC_EBADID) err=API(put_att_text) (ncid,-999,NULL,-999,NULL); EXP_ERR(NC_EPERM) @@ -439,12 +556,22 @@ test_format_nc$1(char *filename) err=API(put_att_'itype`)(ncid,vid_'itype`,`"att_'itype`"',NC_TYPE(itype),1,NULL); EXP_ERR(NC_EPERM)')') /* 
test NC_EPERM */dnl - foreach(`itype',(text, TYPE_LIST),`_CAT(` - err=API_ALL(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EPERM) - err=API_ALL(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EPERM) - err=API_ALL(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EPERM) - err=API_ALL(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EPERM) - err=API_ALL(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EPERM)')') + if (coll_io) { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API_ALL(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EPERM) + err=API_ALL(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EPERM) + err=API_ALL(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EPERM) + err=API_ALL(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EPERM) + err=API_ALL(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EPERM)')') + } + else { + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(put_var_'itype`) (ncid,-999,NULL); EXP_ERR(NC_EPERM) + err=API(put_var1_'itype`)(ncid,-999,NULL,NULL); EXP_ERR(NC_EPERM) + err=API(put_vara_'itype`)(ncid,-999,NULL,NULL,NULL); EXP_ERR(NC_EPERM) + err=API(put_vars_'itype`)(ncid,-999,NULL,NULL,NULL,NULL); EXP_ERR(NC_EPERM) + err=API(put_varm_'itype`)(ncid,-999,NULL,NULL,NULL,NULL,NULL); EXP_ERR(NC_EPERM)')') + } /* close the file */ err=FileClose(-999); EXP_ERR(NC_EBADID) @@ -468,65 +595,67 @@ test_format_nc$1(char *filename) TEST_FORMAT(1) TEST_FORMAT(2) TEST_FORMAT(5) -#if defined(ENABLE_NETCDF4) || defined(TEST_NETCDF) TEST_FORMAT(3) TEST_FORMAT(4) -#endif -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; - int rank=0, nerrs=0; + char val[MPI_MAX_INFO_VAL]; + int err, nerrs=0, flag; - MPI_Init(&argc,&argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* 
check whether burst buffering is enabled */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0 && + (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC)) + /* does not work for NetCDF4 files when burst-buffering is enabled */ + return 0; - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for error precedence ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } verbose = 0; - /* test all file formats separately */ - nerrs += test_format_nc1(filename); - nerrs += test_format_nc2(filename); -#if defined(ENABLE_NETCDF4) || defined(TEST_NETCDF) - nerrs += test_format_nc3(filename); /* NC_FORMAT_NETCDF4 */ - nerrs += test_format_nc4(filename); /* NC_FORMAT_NETCDF4_CLASSIC */ -#endif - nerrs += test_format_nc5(filename); - -#ifndef TEST_NETCDF - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - int err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + if (format == NC_FORMAT_CLASSIC) + nerrs = test_format_nc1(out_path, coll_io, info); + else if (format == NC_FORMAT_64BIT_OFFSET) + nerrs = test_format_nc2(out_path, coll_io, info); + else if (format == NC_FORMAT_NETCDF4) + nerrs = test_format_nc3(out_path, coll_io, info); + else 
if (format == NC_FORMAT_NETCDF4_CLASSIC) + nerrs = test_format_nc4(out_path, coll_io, info); + else if (format == NC_FORMAT_64BIT_DATA) + nerrs = test_format_nc5(out_path, coll_io, info); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } -#endif - MPI_Finalize(); - return (nerrs > 0); + return nerrs; } +int main(int argc, char **argv) { + + int err; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "error precedence", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/file_create_open.c b/test/testcases/file_create_open.c index e4b40e435f..3b306a5cfd 100644 --- a/test/testcases/file_create_open.c +++ b/test/testcases/file_create_open.c @@ -10,76 +10,76 @@ #include #include #include -#include /* basename() */ #include #include #include -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[512]; - int i, err, nprocs, rank, nerrs=0, ncid; - int format[3] = {0, NC_64BIT_OFFSET, NC_64BIT_DATA}; + int err, nprocs, rank, nerrs=0, ncid; + MPI_Info info_dup=MPI_INFO_NULL; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Info_dup(info, &info_dup); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - 
if (argc == 2) snprintf(filename, 512, "%s", argv[1]); - else sprintf(filename, "%s.nc", argv[0]); - - if (rank == 0) { - char *cmd_str = (char *)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for file create", basename(argv[0])); - printf("%-66s ------ ", cmd_str); - free(cmd_str); - } - - for (i=0; i<3; i++) { - /* Create a new file */ - int cmode = NC_CLOBBER | format[i]; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); - CHECK_ERR - - /* Close the file. */ - err = ncmpi_close(ncid); - CHECK_ERR - - /* Open the file */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); - CHECK_ERR - - /* Close the file. */ - err = ncmpi_close(ncid); - CHECK_ERR - } - - /* check if there is any malloc residue */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) - printf(FAIL_STR, nerrs); - else - printf(PASS_STR); - } + MPI_Info_set(info_dup, "nc_header_align_size", "100"); + MPI_Info_set(info_dup, "nc_var_align_size", "200"); + MPI_Info_set(info_dup, "nc_record_align_size", "300"); - MPI_Finalize(); + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + /* Create a new file */ + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info_dup, &ncid); + CHECK_ERR + + /* Close the file. */ + err = ncmpi_close(ncid); + CHECK_ERR + + /* Open the file */ + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_WRITE, info_dup, &ncid); + CHECK_ERR + + /* Close the file. 
*/ + err = ncmpi_close(ncid); + CHECK_ERR + + if (info != MPI_INFO_NULL) MPI_Info_free(&info_dup); return (nerrs > 0); } + +int main(int argc, char **argv) { + + int err; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "file create/open", opt, test_io); + + MPI_Finalize(); + + return err; +} + diff --git a/test/testcases/flexible.c b/test/testcases/flexible.c index e1d6b10dc7..ab7a159cc9 100644 --- a/test/testcases/flexible.c +++ b/test/testcases/flexible.c @@ -42,13 +42,22 @@ #include #define NY 2 -#define NX 5 - -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) { - - char filename[256]; - int i, j, err, ncid, varid1, varid2, varid3, dimids[2], debug=0; +#define NX 70 + +#define INDEP_MODE 0 +#define COLL_MODE 1 + +static int debug; + +/*----< tst_io() >-----------------------------------------------------------*/ +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int i, j, err, ncid, varid1, varid2, varid3, dimids[2]; int rank, nprocs, blocklengths[2], buf[NY][NX], *bufptr; int *ncbuf, req, st, nerrs=0; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; @@ -56,28 +65,15 @@ int main(int argc, char **argv) { MPI_Aint a0, a1, disps[2]; MPI_Datatype buftype; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s 
[filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for flexible put and get ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, - &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define a 2D array */ err = ncmpi_def_dim(ncid, "Y", NC_UNLIMITED, &dimids[0]); CHECK_ERR @@ -108,6 +104,11 @@ int main(int argc, char **argv) { ncmpi_sync(ncid); #endif + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* initialize the contents of the array */ for (j=0; j 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for 
variables */ + + err = tst_main(argc, argv, "flexible APIs", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/flexible2.c b/test/testcases/flexible2.c index da265fda3d..fa815a846e 100644 --- a/test/testcases/flexible2.c +++ b/test/testcases/flexible2.c @@ -7,6 +7,8 @@ /* $Id$ */ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * + * Similar to flexible.c, this program tests APIs with a need of type conversion. * * This program tests PnetCDF flexible APIs, ncmpi_put_vara_all(), * ncmpi_iput_vara() to write two 2D array variables (one is of 4-byte @@ -84,41 +86,32 @@ #define NZ 5 #define NY 5 -#define NX 5 +#define NX 70 -int main(int argc, char** argv) +/*----< tst_io() >-----------------------------------------------------------*/ +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; int i, j, rank, nprocs, err, nerrs=0, req, status, ghost_len=3; - int ncid, cmode, varid0, varid1, dimid[3], *buf_zy, verbose=0; + int ncid, varid0, varid1, dimid[3], *buf_zy, verbose=0; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; double *buf_yx; MPI_Offset start[2], count[2]; - MPI_Datatype subarray; + MPI_Datatype subarray; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for flexible APIs ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define 3 dimensions */ @@ -132,6 +125,11 @@ int main(int argc, char** argv) err = ncmpi_def_var(ncid, "var_yx", NC_FLOAT, 2, &dimid[1], &varid1); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* var_zy is partitioned along Z dimension */ array_of_sizes[0] = NZ + 2*ghost_len; array_of_sizes[1] = NY + 2*ghost_len; @@ -158,7 +156,10 @@ int main(int argc, char** argv) start[0] = NZ * rank; start[1] = 0; count[0] = NZ; count[1] = NY; /* calling a blocking flexible API */ - err = ncmpi_put_vara_all(ncid, varid0, start, count, buf_zy, 1, subarray); + if (coll_io) + err = ncmpi_put_vara_all(ncid, varid0, start, count, buf_zy, 1, subarray); + else + err = ncmpi_put_vara(ncid, varid0, start, count, buf_zy, 1, subarray); CHECK_ERR /* check the contents of put buffer */ @@ -166,12 +167,21 @@ int main(int argc, char** argv) if (buf_zy[i] != rank+10) { printf("Error at line %d in %s: put buffer[%d] is altered\n",__LINE__,__FILE__,i); nerrs++; + goto err_out; } } + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + for (i=0; i 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } +err_out: + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + 
MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "flexible API + type conversion", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/flexible_api.f b/test/testcases/flexible_api.f index 1c7c0990ac..f3f6c196ca 100644 --- a/test/testcases/flexible_api.f +++ b/test/testcases/flexible_api.f @@ -27,8 +27,8 @@ subroutine check(err, message) ! It is a good idea to check returned value for possible error if (err .NE. NF_NOERR) then write(6,*) message(1:XTRIM(message)), nfmpi_strerror(err) - msg = '*** TESTING F77 flexible_api.f for flexible API ' - call pass_fail(1, msg) + msg = '*** TESTING F77 flexible_api.f - flexible API ' + call pass_fail(1, msg, 0) STOP 2 end if end ! subroutine check @@ -38,7 +38,7 @@ program main include "mpif.h" include "pnetcdf.inc" - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer XTRIM integer err, ierr, nerrs, nprocs, rank, i, j integer cmode, ncid, varid, dimid(2), ghost_len, get_args @@ -50,26 +50,33 @@ program main integer array_of_sizes(2), array_of_subsizes(2) integer array_of_starts(2) integer*8 malloc_size, sum_size - logical verbose + logical verbose, keep_files + double precision timing + + call MPI_Init(ierr) + + timing = MPI_Wtime() - call MPI_Init(err) call MPI_Comm_rank(MPI_COMM_WORLD, rank, err) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, err) - ! take filename from command-line argument if there is any + ! 
take out_path from command-line argument if there is any if (rank .EQ. 0) then verbose = .TRUE. - filename = "testfile.nc" - ierr = get_args(cmd, filename) + out_path = "testfile.nc" + ierr = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(ierr, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, err) if (ierr .EQ. 0) goto 999 call MPI_Bcast(verbose, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, + err) - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, + MPI_COMM_WORLD, err) + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, + + ierr) + nerrs = 0 ! set parameters @@ -103,7 +110,7 @@ program main ! create file, truncate it if exists cmode = IOR(NF_CLOBBER, NF_64BIT_DATA) - err = nfmpi_create(MPI_COMM_WORLD, filename, cmode, + err = nfmpi_create(MPI_COMM_WORLD, out_path, cmode, + MPI_INFO_NULL, ncid) call check(err, 'In nfmpi_create: ') @@ -291,10 +298,18 @@ program main + sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))// - + ' for bufcount=NF_COUNT_IGNORE & buftype predefined' - call pass_fail(nerrs, msg) + + ' - bufcount=NF_COUNT_IGNORE & buftype predefined' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/flexible_large_count.c b/test/testcases/flexible_large_count.c index 9c225518ae..95c011c1e3 100644 --- a/test/testcases/flexible_large_count.c +++ b/test/testcases/flexible_large_count.c @@ -139,51 +139,40 @@ } \ } -int main(int argc, char** argv) +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; int i, j, rank, nprocs, err, nerrs=0, req, status; - int ncid, cmode, varid, dimid[2]; + int ncid, varid, dimid[2]; MPI_Count array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; int buf_ghost[NY+2*GHOST][NX+2*GHOST]; int buf[NY][NX]; MPI_Offset bufcount, start[2], count[2]; MPI_Datatype subarray; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for flexible var APIs ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); - CHECK_ERROUT + err = ncmpi_create(MPI_COMM_WORLD, out_path, 
NC_CLOBBER, info, &ncid); + CHECK_ERR /* define 2 dimensions */ - err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); CHECK_ERROUT - err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); CHECK_ERROUT + err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); CHECK_ERR + err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); CHECK_ERR - /* define a variable of size NY * (NX * nprocs) */ - err = ncmpi_def_var(ncid, "var", NC_DOUBLE, 2, dimid, &varid); CHECK_ERROUT - err = ncmpi_enddef(ncid); CHECK_ERROUT + /* define a variable of size NY * NX */ + err = ncmpi_def_var(ncid, "var", NC_DOUBLE, 2, dimid, &varid); CHECK_ERR + err = ncmpi_enddef(ncid); CHECK_ERR - /* var is partitioned along X dimension in a matrix transported way */ + /* var is written in a matrix transported way */ array_of_sizes[0] = NY + 2*GHOST; array_of_sizes[1] = NX + 2*GHOST; array_of_subsizes[0] = NY; @@ -211,7 +200,7 @@ int main(int argc, char** argv) else /* other ranks write 0-sized data */ err = ncmpi_put_vara_all(ncid, varid, start, count, buf_ghost, 0, MPI_INT); MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); - CHECK_ERROUT + CHECK_ERR /* check the contents of put buffer. They should not be altered. 
*/ CHECK_PUT_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -219,7 +208,7 @@ int main(int argc, char** argv) /* read back and check the contents written in the file ----------------*/ INIT_GET_BUF(NY, NX, buf_ghost) err = ncmpi_get_var_all(ncid, varid, buf_ghost, bufcount, subarray); - CHECK_ERROUT + CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -227,9 +216,9 @@ int main(int argc, char** argv) /* read back using a non-blocking flexible API --------------------------*/ INIT_GET_BUF(NY, NX, buf_ghost) err = ncmpi_iget_var(ncid, varid, buf_ghost, bufcount, subarray, &req); - CHECK_ERROUT - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -240,17 +229,17 @@ int main(int argc, char** argv) /* calling a nonblocking put_var flexible API --------------------------*/ if (rank == 0) { /* only rank 0 writes to the variable */ err = ncmpi_iput_var(ncid, varid, buf_ghost, bufcount, subarray, &req); - CHECK_ERROUT + CHECK_ERR /* check the contents of put buffer. They should not be altered. 
*/ CHECK_PUT_BUF_GHOST(NY, NX, GHOST, buf_ghost) } - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* read back and check the contents written in the file ----------------*/ INIT_GET_BUF(NY, NX, buf_ghost) err = ncmpi_get_var_all(ncid, varid, buf_ghost, bufcount, subarray); - CHECK_ERROUT + CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -258,9 +247,9 @@ int main(int argc, char** argv) /* read back using a non-blocking flexible API --------------------------*/ INIT_GET_BUF(NY, NX, buf_ghost) err = ncmpi_iget_var(ncid, varid, buf_ghost, bufcount, subarray, &req); - CHECK_ERROUT - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -279,7 +268,7 @@ int main(int argc, char** argv) else /* other ranks write 0-sized data */ err = ncmpi_put_vara_all(ncid, varid, start, count, buf, 0, MPI_INT); MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); - CHECK_ERROUT + CHECK_ERR /* check the contents of put buffer. They should not be altered. 
*/ CHECK_PUT_BUF(NY, NX, buf) @@ -287,7 +276,7 @@ int main(int argc, char** argv) /* read back and check the contents written in the file ----------------*/ INIT_GET_BUF(NY, NX, buf) err = ncmpi_get_var_all(ncid, varid, buf, bufcount, MPI_INT); - CHECK_ERROUT + CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(NY, NX, buf) @@ -295,9 +284,9 @@ int main(int argc, char** argv) /* read back using a non-blocking flexible API --------------------------*/ INIT_GET_BUF(NY, NX, buf) err = ncmpi_iget_var(ncid, varid, buf, bufcount, MPI_INT, &req); - CHECK_ERROUT - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(NY, NX, buf) @@ -308,17 +297,17 @@ int main(int argc, char** argv) /* calling a nonblocking put_var flexible API --------------------------*/ if (rank == 0) { /* only rank 0 writes to the variable */ err = ncmpi_iput_var(ncid, varid, buf, bufcount, MPI_INT, &req); - CHECK_ERROUT + CHECK_ERR /* check the contents of put buffer. They should not be altered. 
*/ CHECK_PUT_BUF(NY, NX, buf) } - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* read back and check the contents written in the file ----------------*/ INIT_GET_BUF(NY, NX, buf) err = ncmpi_get_var_all(ncid, varid, buf, bufcount, MPI_INT); - CHECK_ERROUT + CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(NY, NX, buf) @@ -326,18 +315,23 @@ int main(int argc, char** argv) /* read back using a non-blocking flexible API --------------------------*/ INIT_GET_BUF(NY, NX, buf) err = ncmpi_iget_var(ncid, varid, buf, bufcount, MPI_INT, &req); - CHECK_ERROUT - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(NY, NX, buf) + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + /*----------------------------------------------------------------------*/ /*---- test independent I/O mode ---------------------------------------*/ /*----------------------------------------------------------------------*/ err = ncmpi_begin_indep_data(ncid); - CHECK_ERROUT + CHECK_ERR /*----------------------------------------------------------------------*/ /*---- test using bufcount == 1 with ghost cells -----------------------*/ @@ -350,19 +344,19 @@ int main(int argc, char** argv) /* calling a blocking put_var flexible API -----------------------------*/ if (rank == 0) { /* only rank 0 writes to the variable */ err = ncmpi_put_var(ncid, varid, buf_ghost, bufcount, subarray); - CHECK_ERROUT + CHECK_ERR /* check the contents of put buffer. They should not be altered. 
*/ CHECK_PUT_BUF_GHOST(NY, NX, GHOST, buf_ghost) } /* file sync is required for non-zero ranks to see the data in file */ err = ncmpi_sync(ncid); - CHECK_ERROUT + CHECK_ERR /* read back and check the contents written in the file ----------------*/ INIT_GET_BUF(NY, NX, buf_ghost) err = ncmpi_get_var(ncid, varid, buf_ghost, bufcount, subarray); - CHECK_ERROUT + CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -370,9 +364,9 @@ int main(int argc, char** argv) /* read back using a non-blocking flexible API --------------------------*/ INIT_GET_BUF(NY, NX, buf_ghost) err = ncmpi_iget_var(ncid, varid, buf_ghost, bufcount, subarray, &req); - CHECK_ERROUT - err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + CHECK_ERR + err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -383,17 +377,17 @@ int main(int argc, char** argv) /* calling a nonblocking put_var flexible API --------------------------*/ if (rank == 0) { /* only rank 0 writes to the variable */ err = ncmpi_iput_var(ncid, varid, buf_ghost, bufcount, subarray, &req); - CHECK_ERROUT + CHECK_ERR /* check the contents of put buffer. They should not be altered. 
*/ CHECK_PUT_BUF_GHOST(NY, NX, GHOST, buf_ghost) } - err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* read back and check the contents written in the file ----------------*/ INIT_GET_BUF(NY, NX, buf_ghost) err = ncmpi_get_var(ncid, varid, buf_ghost, bufcount, subarray); - CHECK_ERROUT + CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -401,9 +395,9 @@ int main(int argc, char** argv) /* read back using a non-blocking flexible API --------------------------*/ INIT_GET_BUF(NY, NX, buf_ghost) err = ncmpi_iget_var(ncid, varid, buf_ghost, bufcount, subarray, &req); - CHECK_ERROUT - err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + CHECK_ERR + err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF_GHOST(NY, NX, GHOST, buf_ghost) @@ -419,19 +413,19 @@ int main(int argc, char** argv) /* calling a blocking put_var flexible API -----------------------------*/ if (rank == 0) { /* only rank 0 writes to the variable */ err = ncmpi_put_var(ncid, varid, buf, bufcount, MPI_INT); - CHECK_ERROUT + CHECK_ERR /* check the contents of put buffer. They should not be altered. 
*/ CHECK_PUT_BUF(NY, NX, buf) } /* file sync is required for non-zero ranks to see the data in file */ err = ncmpi_sync(ncid); - CHECK_ERROUT + CHECK_ERR /* read back and check the contents written in the file ----------------*/ INIT_GET_BUF(NY, NX, buf) err = ncmpi_get_var(ncid, varid, buf, bufcount, MPI_INT); - CHECK_ERROUT + CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(NY, NX, buf) @@ -439,9 +433,9 @@ int main(int argc, char** argv) /* read back using a non-blocking flexible API --------------------------*/ INIT_GET_BUF(NY, NX, buf) err = ncmpi_iget_var(ncid, varid, buf, bufcount, MPI_INT, &req); - CHECK_ERROUT - err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + CHECK_ERR + err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(NY, NX, buf) @@ -452,17 +446,17 @@ int main(int argc, char** argv) /* calling a nonblocking put_var flexible API --------------------------*/ if (rank == 0) { /* only rank 0 writes to the variable */ err = ncmpi_iput_var(ncid, varid, buf, bufcount, MPI_INT, &req); - CHECK_ERROUT + CHECK_ERR /* check the contents of put buffer. They should not be altered. 
*/ CHECK_PUT_BUF(NY, NX, buf) } - err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* read back and check the contents written in the file ----------------*/ INIT_GET_BUF(NY, NX, buf) err = ncmpi_get_var(ncid, varid, buf, bufcount, MPI_INT); - CHECK_ERROUT + CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(NY, NX, buf) @@ -470,36 +464,42 @@ int main(int argc, char** argv) /* read back using a non-blocking flexible API --------------------------*/ INIT_GET_BUF(NY, NX, buf) err = ncmpi_iget_var(ncid, varid, buf, bufcount, MPI_INT, &req); - CHECK_ERROUT - err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERROUT - err = status; CHECK_ERROUT + CHECK_ERR + err = ncmpi_wait(ncid, 1, &req, &status); CHECK_ERR + err = status; CHECK_ERR /* check the contents of get buffer */ CHECK_GET_BUF(NY, NX, buf) MPI_Type_free(&subarray); - err = ncmpi_close(ncid); CHECK_ERROUT - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + err = ncmpi_close(ncid); CHECK_ERR -err_out: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + 
opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "flexible var APIs", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/flexible_var.c b/test/testcases/flexible_var.c index ceb712eb29..8e6207c063 100644 --- a/test/testcases/flexible_var.c +++ b/test/testcases/flexible_var.c @@ -53,8 +53,12 @@ #include +/* Smaller NY and NX are for demonstration, producing outputs as shown above. #define NY 6 #define NX 4 +*/ +#define NY 32 +#define NX 128 #define GHOST 2 #define PRINT_PUT_BUF(Y, X, buf) { \ @@ -143,40 +147,23 @@ } \ } -int main(int argc, char** argv) +static +int tst_io(const char *out_path, + MPI_Info info) { - char filename[256]; int i, j, rank, nprocs, err, nerrs=0, req, status; - int ncid, cmode, varid, dimid[2]; + int ncid, varid, dimid[2]; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; int buf_ghost[NY+2*GHOST][NX+2*GHOST]; int buf[NY][NX]; MPI_Offset bufcount, start[2], count[2]; MPI_Datatype subarray; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for flexible var APIs ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | 
NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERROUT /* define 2 dimensions */ @@ -485,25 +472,62 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERROUT - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - err_out: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } - - MPI_Finalize(); return (nerrs > 0); } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + int err, nerrs=0; + MPI_Info info_dup; + + MPI_Info_dup(info, &info_dup); + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + nerrs = tst_io(out_path, info_dup); + if (nerrs > 0) return nerrs; + + /* disable PnetCDF internal buffering */ + MPI_Info_set(info_dup, "nc_ibuf_size", "0"); + + nerrs = tst_io(out_path, info_dup); + if (nerrs > 0) return nerrs; + + MPI_Info_free(&info_dup); + + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test 
burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "flexible var APIs", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/flexible_varm.c b/test/testcases/flexible_varm.c index 03cb2ce161..b1591b4014 100644 --- a/test/testcases/flexible_varm.c +++ b/test/testcases/flexible_varm.c @@ -52,8 +52,8 @@ #include -#define NY 6 -#define NX 4 +#define NY 32 +#define NX 128 #define GHOST 2 #define INIT_PUT_BUF \ @@ -76,6 +76,7 @@ printf("Error at line %d in %s: put buffer altered buffer[%d][%d]=%d\n", \ __LINE__,__FILE__,i,j,buf[i][j]); \ nerrs++; \ + goto err_out; \ } \ } \ else { \ @@ -83,6 +84,7 @@ printf("Error at line %d in %s: put buffer altered buffer[%d][%d]=%d\n", \ __LINE__,__FILE__,i,j,buf[i][j]); \ nerrs++; \ + goto err_out; \ } \ } \ } \ @@ -102,6 +104,7 @@ printf("Unexpected get buffer[%d][%d]=%d\n", \ i,j,buf[i][j]); \ nerrs++; \ + goto err_out; \ } \ } \ else { \ @@ -109,44 +112,38 @@ printf("Unexpected get buffer[%d][%d]=%d\n", \ i,j,buf[i][j]); \ nerrs++; \ + goto err_out; \ } \ } \ } \ } -int main(int argc, char** argv) +#define INDEP_MODE 0 +#define COLL_MODE 1 + +static +int tst_varm(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; int i, j, rank, nprocs, err, nerrs=0, req, status; - int ncid, cmode, varid, dimid[2]; + int ncid, varid, dimid[2]; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; int buf[NX+2*GHOST][NY+2*GHOST]; MPI_Offset start[2], count[2], stride[2], imap[2]; MPI_Datatype subarray; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); 
- else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for flexible varm APIs ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define 2 dimensions */ @@ -157,6 +154,11 @@ int main(int argc, char** argv) err = ncmpi_def_var(ncid, "var", NC_DOUBLE, 2, dimid, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + start[0] = 0; start[1] = NX * rank; count[0] = NY; count[1] = NX; stride[0] = 1; stride[1] = 1; @@ -176,7 +178,11 @@ int main(int argc, char** argv) /* calling a blocking put_varm flexible API -----------------------------*/ /* initiate put buffer contents */ INIT_PUT_BUF - err = ncmpi_put_varm_all(ncid, varid, start, count, stride, imap, buf, + if (coll_io) + err = ncmpi_put_varm_all(ncid, varid, start, count, stride, imap, buf, + 1, subarray); + else + err = ncmpi_put_varm(ncid, varid, start, count, stride, imap, buf, 1, subarray); CHECK_ERR @@ -189,18 +195,35 @@ int main(int argc, char** argv) err = ncmpi_iput_varm(ncid, varid, start, count, stride, imap, buf, 1, subarray, &req); CHECK_ERR - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + + if (coll_io) + err = ncmpi_wait_all(ncid, 1, &req, &status); + else + err = ncmpi_wait(ncid, 1, &req, &status); + CHECK_ERR err = status; CHECK_ERR /* check the contents of put buffer */ CHECK_PUT_BUF + /* When running in independent data mode, flushing writes is necessary + * before 
reading the data back. + */ + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + /* read back using a blocking get_varm flexible API ---------------------*/ /* initiate get buffer contents */ INIT_GET_BUF /* calling a blocking flexible API */ - err = ncmpi_get_varm_all(ncid, varid, start, count, stride, imap, buf, + if (coll_io) + err = ncmpi_get_varm_all(ncid, varid, start, count, stride, imap, buf, + 1, subarray); + else + err = ncmpi_get_varm(ncid, varid, start, count, stride, imap, buf, 1, subarray); CHECK_ERR @@ -215,7 +238,11 @@ int main(int argc, char** argv) err = ncmpi_iget_varm(ncid, varid, start, count, stride, imap, buf, 1, subarray, &req); CHECK_ERR - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, 1, &req, &status); + else + err = ncmpi_wait(ncid, 1, &req, &status); + CHECK_ERR err = status; CHECK_ERR /* check the contents of get buffer */ @@ -225,24 +252,55 @@ int main(int argc, char** argv) err = ncmpi_close(ncid); CHECK_ERR - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } +err_out: + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int nerrs=0; - MPI_Finalize(); - return (nerrs > 0); + nerrs = tst_varm(out_path, in_path, format, coll_io, info); + if (nerrs > 0) goto err_out; + + /* disable PnetCDF internal buffering */ + 
MPI_Info_set(info, "nc_ibuf_size", "0"); + + nerrs = tst_varm(out_path, in_path, format, coll_io, info); + if (nerrs > 0) goto err_out; + +err_out: + return nerrs; } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "flexible varm APIs", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/inq_num_vars.c b/test/testcases/inq_num_vars.c index 6cc8f44d4d..9b3295c8d5 100644 --- a/test/testcases/inq_num_vars.c +++ b/test/testcases/inq_num_vars.c @@ -33,9 +33,9 @@ static int check_num_vars(int ncid, - int expected_nvars, - int expected_num_rec_vars, - int expected_num_fix_vars) + int expected_nvars, + int expected_num_rec_vars, + int expected_num_fix_vars) { int err, nerrs=0, nvars, num_rec_vars, num_fix_vars; @@ -66,15 +66,29 @@ int check_num_vars(int ncid, return nerrs; } -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - int nerrs=0, err, ncid, varid[7], dimid[3]; - MPI_Info info=MPI_INFO_NULL; + char val[MPI_MAX_INFO_VAL]; + int nerrs=0, err, flag, ncid, varid[7], dimid[3]; + + /* check whether burst buffering is enabled */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0 && + (format == NC_FORMAT_NETCDF4 
|| format == NC_FORMAT_NETCDF4_CLASSIC)) + /* does not work for NetCDF4 files when burst-buffering is enabled */ + return 0; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimension and variable */ err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -109,10 +123,15 @@ tst_fmt(char *filename, int cmode) nerrs += check_num_vars(ncid, 7, 4, 3); + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_close(ncid); CHECK_ERR /* open the file for reading --------------------------------------------*/ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, info, &ncid); + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR nerrs += check_num_vars(ncid, 7, 4, 3); @@ -122,64 +141,27 @@ tst_fmt(char *filename, int cmode) return nerrs; } -int main(int argc, char** argv) { - char filename[256], *hint_value; - int nerrs=0, rank, err, bb_enabled=0; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); +int main(int argc, char **argv) { - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - goto fn_exit; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for no. 
record/fixed variables", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + int err; + loop_opts opt; - /* printf("PnetCDF version string: \"%s\"\n", ncmpi_inq_libvers()); */ + MPI_Init(&argc, &argv); - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + err = tst_main(argc, argv, "number of variables", opt, test_io); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } - -fn_exit: MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/inq_num_varsf.f90 
b/test/testcases/inq_num_varsf.f90 index 85c689aa95..7ba9385d73 100644 --- a/test/testcases/inq_num_varsf.f90 +++ b/test/testcases/inq_num_varsf.f90 @@ -46,8 +46,8 @@ subroutine check(err, message) ! It is a good idea to check returned value for possible error if (err .NE. NF90_NOERR) then write(6,*) trim(message), trim(nf90mpi_strerror(err)) - msg = '*** TESTING F90 inq_num_varsf.f90 for no. record/fixed variables' - call pass_fail(1, msg) + msg = '*** TESTING F90 inq_num_varsf.f90 - no. record/fixed variables' + call pass_fail(1, msg, 0.0d0) call MPI_Abort(MPI_COMM_WORLD, -1, err) end if end subroutine check @@ -57,29 +57,36 @@ program main use pnetcdf implicit none - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer err, ierr, nprocs, rank, cmode, ncid, get_args integer varid(7), dimid(3), dimid_1D(1), dimid_2D(2) integer nerrs, nvars, num_rec_vars, num_fix_vars, old_mode integer(kind=MPI_OFFSET_KIND) malloc_size, sum_size + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) !
create file, truncate it if exists cmode = IOR(NF90_CLOBBER, NF90_64BIT_OFFSET) - err = nf90mpi_create(MPI_COMM_WORLD, filename, cmode, & + err = nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, & MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_create: ') @@ -152,7 +159,7 @@ program main call check(err, 'In nf90mpi_close: ') ! open the file just now created - err = nf90mpi_open(MPI_COMM_WORLD, filename, NF90_NOWRITE, & + err = nf90mpi_open(MPI_COMM_WORLD, out_path, NF90_NOWRITE, & MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_open: ') @@ -193,10 +200,18 @@ program main sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F90 '//trim(cmd)// & - ' for no. record/fixed variables' - call pass_fail(nerrs, msg) + ' - no. record/fixed variables' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/inq_recsize.c b/test/testcases/inq_recsize.c index fec23ab2b9..cd99a5aaca 100644 --- a/test/testcases/inq_recsize.c +++ b/test/testcases/inq_recsize.c @@ -30,17 +30,30 @@ #include -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - int nerrs=0, err; - int ncid, varid[7], dimid[3]; + char val[MPI_MAX_INFO_VAL]; + int nerrs=0, err, flag, ncid, varid[7], dimid[3]; MPI_Offset expected_recsize, recsize; - MPI_Info info=MPI_INFO_NULL; + + /* check whether burst buffering is enabled */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0 && + (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC)) + /* does not work for NetCDF4 files when burst-buffering is enabled */ + return 0; + + /* 
Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimension and variable */ err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -85,64 +98,27 @@ tst_fmt(char *filename, int cmode) return nerrs; } -int main(int argc, char** argv) { - char filename[256], *hint_value; - int nerrs=0, rank, err, bb_enabled=0; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - goto fn_exit; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for inquiring record size ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } +int main(int argc, char **argv) { - /* printf("PnetCDF version string: \"%s\"\n", ncmpi_inq_libvers()); */ + int err; + loop_opts opt; - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + MPI_Init(&argc, &argv); - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - 
if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "inquiring record size", opt, test_io); -fn_exit: MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/inq_recsizef.f90 b/test/testcases/inq_recsizef.f90 index 208b04df2f..8c7e37f9c5 100644 --- a/test/testcases/inq_recsizef.f90 +++ b/test/testcases/inq_recsizef.f90 @@ -29,8 +29,8 @@ subroutine check(err, message) ! It is a good idea to check returned value for possible error if (err .NE. 
NF90_NOERR) then write(6,*) trim(message), trim(nf90mpi_strerror(err)) - msg = '*** TESTING F90 inq_recsizef.f90 for inquiring record size' - call pass_fail(1, msg) + msg = '*** TESTING F90 inq_recsizef.f90 - inquiring record size' + call pass_fail(1, msg, 0.0d0) call MPI_Abort(MPI_COMM_WORLD, -1, err) end if end subroutine check @@ -40,29 +40,36 @@ program main use pnetcdf implicit none - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer err, ierr, nprocs, rank, cmode, ncid, nerrs, get_args integer varid(7), dimid(3), dimid_1D(1), dimid_2D(2), old_mode integer(kind=MPI_OFFSET_KIND) expected_recsize, recsize integer(kind=MPI_OFFSET_KIND) malloc_size, sum_size + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) ! create file, truncate it if exists cmode = IOR(NF90_CLOBBER, NF90_64BIT_OFFSET) - err = nf90mpi_create(MPI_COMM_WORLD, filename, cmode, & MPI_INFO_NULL, ncid) + err = nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, & MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_create: ') @@ -138,9 +145,17 @@ program main sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) if (rank .eq.
0) then - msg = '*** TESTING F90 '//trim(cmd)//' for inquiring record size' - call pass_fail(nerrs, msg) + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - inquiring record size' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/iput_all_kinds.m4 b/test/testcases/iput_all_kinds.m4 index 81be7b3447..1d337896db 100644 --- a/test/testcases/iput_all_kinds.m4 +++ b/test/testcases/iput_all_kinds.m4 @@ -9,7 +9,6 @@ dnl * See COPYRIGHT notice in top-level directory. * *********************************************************************/ -/* $Id: transpose.c 3078 2017-05-29 22:46:50Z wkliao $ */ #include #include @@ -21,7 +20,9 @@ dnl #include #define NDIMS 3 -#define LEN 2 +#define LEN 13 + +static int debug; include(`foreach.m4')dnl include(`utils.m4')dnl @@ -46,12 +47,153 @@ include(`utils.m4')dnl #define ulonglong unsigned long long #endif +#define INIT_BUF(buf, len) { \ + for (i=0; i YXZ */ + dimidsT[0] = dimids[1]; dimidsT[1] = dimids[2]; dimidsT[2] = dimids[0]; + err = ncmpi_def_var(ncid, "varm_$1", NC_TYPE($1), NDIMS, dimidsT, &varm_id); + CHECK_ERR + + return err; +} +')dnl + +foreach(`itype',(`text,schar,uchar,short,ushort,int,uint,long,float,double,longlong,ulonglong'),`DEFINE_VARS(itype)') + define(`TEST_NON_BLOCKING_PUT',dnl `dnl static int non_blocking_put_$1(int rank, + int coll_io, + int ncid, + MPI_Offset *gsize, + MPI_Offset *start, + MPI_Offset *count, + MPI_Offset *startS, + MPI_Offset *countS, + MPI_Offset *stride, + MPI_Offset *startM, + MPI_Offset *countM, + MPI_Offset *imap) +{ + int i, err=NC_NOERR, exp; + int var1_id, vara_id, vars_id, varm_id; + MPI_Offset start1[1]; + size_t bufsize, bufsizeS, bufsizeM; + + err = ncmpi_inq_varid(ncid, "var1_$1", &var1_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, "vara_$1", &vara_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, "vars_$1", &vars_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, 
"varm_$1", &varm_id); CHECK_ERR + + bufsize = bufsizeS = bufsizeM = 1; + for (i=0; i YXZ */ - dimidsT[0] = dimids[1]; dimidsT[1] = dimids[2]; dimidsT[2] = dimids[0]; - err = ncmpi_def_var(ncid, "varm_$1", NC_TYPE($1), NDIMS, dimidsT, &varm_id); CHECK_ERR + MALLOC_ITYPE($1, *buf1, 1) + MALLOC_ITYPE($1, *bufa, bufsize) + MALLOC_ITYPE($1, *bufs, bufsizeS) + MALLOC_ITYPE($1, *bufm, bufsizeM) - /* exit the define mode */ - err = ncmpi_enddef(ncid); CHECK_ERR + ZERO_OUT_BUF(buf1, 1) + ZERO_OUT_BUF(bufa, bufsize) + ZERO_OUT_BUF(bufs, bufsizeS) + ZERO_OUT_BUF(bufm, bufsizeM) - /* write the whole variable in parallel */ + err = ncmpi_inq_varid(ncid, "var1_$1", &var1_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, "vara_$1", &vara_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, "vars_$1", &vars_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, "varm_$1", &varm_id); CHECK_ERR + + /* write the variable in parallel */ start1[0] = rank; - err = `ncmpi_iput_var1_'ifelse(`$1',`text',`$1',`double')(ncid, var1_id, start1, buf, NULL); CHECK_ERR + err = `ncmpi_iget_var1_'$1(ncid, var1_id, start1, buf1, NULL); CHECK_ERR + + err = `ncmpi_iget_vara_'$1(ncid, vara_id, start, count, bufa, NULL); CHECK_ERR + + err = `ncmpi_iget_vars_'$1(ncid, vars_id, startS, countS, stride, bufs, NULL); CHECK_ERR + + err = `ncmpi_iget_varm_'$1(ncid, varm_id, startM, countM, NULL, imap, bufm, NULL); CHECK_ERR - err = `ncmpi_iput_vara_'ifelse(`$1',`text',`$1',`double')(ncid, vara_id, start, count, buf, NULL); CHECK_ERR + /* commit all nonblocking requests */ + if (!coll_io) + err = ncmpi_wait(ncid, NC_REQ_ALL, NULL, NULL); + else + err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); + CHECK_ERR - err = `ncmpi_iput_vars_'ifelse(`$1',`text',`$1',`double')(ncid, vars_id, startS, countS, stride, buf, NULL); CHECK_ERR + /* Check read contents */ + exp = (rank + 1) % 128; + if (buf1[0] != exp) { + printf("Error %s at %d: buf1 expects %.f but got %.f\n", + __func__,__LINE__, (float)exp, (float)buf1[0]); + CHECK_ERR + } + for 
(i=0; i------------------------------------------------------------*/ -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256], fname[512], *cbuf; - int i, j, k, rank, nprocs, ncid, bufsize, err, nerrs=0, cmode; + int i, rank, nprocs, ncid, err=NC_NOERR; int psize[NDIMS], dimids[NDIMS], dim_rank[NDIMS]; - double *buf; MPI_Offset _nprocs; MPI_Offset gsize[NDIMS], stride[NDIMS], imap[NDIMS]; MPI_Offset start[NDIMS], count[NDIMS]; MPI_Offset startS[NDIMS], countS[NDIMS]; MPI_Offset startM[NDIMS], countM[NDIMS]; - MPI_Info info; - MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for all kinds put APIs ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - /* calculate number of processes along each dimension */ for (i=0; i YXZ: (this is borrowed from examples/C/transpose.c */ imap[1] = 1; imap[0] = count[2]; imap[2] = count[1]*count[2]; startM[0] = start[1]; startM[1] = start[2]; startM[2] = start[0]; countM[0] = count[1]; countM[1] = count[2]; countM[2] = count[0]; + if (debug) + printf("startM=%lld %lld %lld countM=%lld %lld %lld stride=%lld %lld %lld imap=%lld %lld %lld\n", + startM[0],startM[1],startM[2],countM[0],countM[1],countM[2], + stride[0],stride[1],stride[2], imap[0],imap[1],imap[2]); - /* test CDF-1, 2, and 5 formats separately */ - TEST_CDF_FORMAT(NC_FORMAT_CLASSIC) - TEST_CDF_FORMAT(NC_FORMAT_64BIT_OFFSET) - TEST_CDF_FORMAT(NC_FORMAT_64BIT_DATA) - - free(cbuf); - free(buf); -
MPI_Info_free(&info); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + MPI_Info_set(info, "nc_var_align_size", "1"); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - MPI_Finalize(); - return (nerrs > 0); + TEST_CDF_FORMAT_PUT + TEST_CDF_FORMAT_GET + + return err; } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "all kinds iput APIs", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/ivarn.c b/test/testcases/ivarn.c index abb818e064..283e1d7935 100644 --- a/test/testcases/ivarn.c +++ b/test/testcases/ivarn.c @@ -153,11 +153,15 @@ int check_dbl_buf(double *buffer, double extra, int lineno) return 0; } -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int 
coll_io, + MPI_Info info) { - char filename[256]; int i, rank, nprocs, err, nerrs=0; - int ncid, cmode, dimid[2]; + int ncid, dimid[2]; int vari0001, vari0002, varr0001, varr0002, vard0001, vard0002; MPI_Offset **starts, **counts; int req[LEN], st[LEN], num_reqs=0; @@ -165,34 +169,20 @@ int main(int argc, char** argv) float rbuf1[LEN], rbuf2[LEN]; double dbuf1[LEN], dbuf2[LEN]; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for ncmpi_iput_varn_() ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - #ifdef DEBUG if (nprocs != 4 && rank == 0) printf("Warning: %s is intended to run on 4 processes\n",argv[0]); #endif + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "dim000001", LEN, &dimid[1]); CHECK_ERR err = ncmpi_def_dim(ncid, "time", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -214,6 +204,11 @@ int main(int argc, char** argv) ncmpi_sync(ncid); #endif + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + if (nprocs < 4) { /* need 4 processes to fill the variables */ err = ncmpi_fill_var_rec(ncid, vari0001, 0); CHECK_ERR err = ncmpi_fill_var_rec(ncid, varr0001, 0); CHECK_ERR @@ -406,11 +401,20 @@ int main(int argc, char** argv) */ } - err = ncmpi_wait_all(ncid, num_reqs, req, st); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, num_reqs, req, st); + else + err = ncmpi_wait(ncid, num_reqs, req, st); + CHECK_ERR for (i=0; i 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file 
header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "ncmpi_iput_varn_()", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/large_var_cdf5.c b/test/testcases/large_var_cdf5.c index 0c39afca0d..e8233778d5 100644 --- a/test/testcases/large_var_cdf5.c +++ b/test/testcases/large_var_cdf5.c @@ -28,33 +28,20 @@ #include -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; - int rank, nprocs, err, nerrs=0, ncid, dimid[2], varid[2]; + int err, nerrs=0, ncid, dimid[2], varid[2]; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for large var in CDF-5", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER|NC_64BIT_DATA, - MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "dim0", NC_UNLIMITED, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "dim1", NC_MAX_INT64, &dimid[1]); CHECK_ERR @@ -67,24 +54,31 @@ int main(int argc, char** argv) err = ncmpi_set_fill(ncid, NC_NOFILL, NULL); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "large var in CDF-5", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/last_large_var.c b/test/testcases/last_large_var.c index 458b9cf07a..cdfb71583f 100644 --- a/test/testcases/last_large_var.c +++ b/test/testcases/last_large_var.c @@ -94,13 +94,13 @@ 
#endif static -int check_last_var(char *filename) +int check_last_var(const char *filename, MPI_Info info) { int err, nerrs=0, ncid, cmode, fill_mode, varid, dimid[4]; /* create a new file ---------------------------------------------------*/ cmode = NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "Y", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -120,13 +120,13 @@ int check_last_var(char *filename) } static -int check_fix_var(char *filename) +int check_fix_var(const char *filename, MPI_Info info) { int err, nerrs=0, ncid, cmode, fill_mode, varid, dimid[4]; /* create a new CDF-1 file ----------------------------------------------*/ cmode = NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "X", 536870911, &dimid[0]); CHECK_ERR @@ -144,7 +144,7 @@ int check_fix_var(char *filename) /* create a new CDF-2 file ----------------------------------------------*/ cmode = NC_CLOBBER | NC_64BIT_OFFSET; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "X", 536870911, &dimid[0]); CHECK_ERR @@ -165,13 +165,13 @@ int check_fix_var(char *filename) } static -int check_fix_rec_var(char *filename) +int check_fix_rec_var(const char *filename, MPI_Info info) { int err, nerrs=0, ncid, cmode, fill_mode, varid, dimid[4]; /* create a new file ---------------------------------------------------*/ cmode = NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "Y", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -196,13 +196,13 @@ int check_fix_rec_var(char *filename) * record is less than 2 GiB - 4. 
*/ static -int check_rec_var(char *filename, int cmode) +int check_rec_var(const char *filename, int cmode, MPI_Info info) { int err, nerrs=0, ncid, fill_mode, varid, dimid[3]; /* create a new file ---------------------------------------------------*/ cmode |= NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "Z", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -218,7 +218,7 @@ int check_rec_var(char *filename, int cmode) /* create a new file ---------------------------------------------------*/ cmode |= NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "Z", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -234,7 +234,7 @@ int check_rec_var(char *filename, int cmode) /* create a new file ---------------------------------------------------*/ cmode |= NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "Z", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -262,13 +262,13 @@ int check_rec_var(char *filename, int cmode) * this variable must be less than about 2 GiB. 
*/ static -int check_not_last_var(char *filename) +int check_not_last_var(const char *filename, MPI_Info info) { int err, nerrs=0, ncid, cmode, fill_mode, varid, dimid[4]; /* create a new file ---------------------------------------------------*/ cmode = NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "Y", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -288,13 +288,13 @@ int check_not_last_var(char *filename) } static -int check_add_var(char *filename) +int check_add_var(const char *filename, MPI_Info info) { int err, nerrs=0, ncid, cmode, fill_mode, varid, dimid[4]; /* create a new file ---------------------------------------------------*/ cmode = NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "Y", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -320,13 +320,13 @@ int check_add_var(char *filename) } static -int check_var_offset(char *filename) +int check_var_offset(const char *filename, MPI_Info info) { int err, nerrs=0, ncid, cmode, fill_mode, varid, dimid[4]; /* create a new file ---------------------------------------------------*/ cmode = NC_CLOBBER; - err = FileCreate(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = FileCreate(MPI_COMM_WORLD, filename, cmode, info, &ncid); CHECK_ERR err = DefDim(ncid, "Y", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -359,64 +359,58 @@ int check_var_offset(char *filename) return nerrs; } -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, /* ignored */ + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; - int rank=0, nprocs=1, err, nerrs=0; + int nerrs=0; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - if (argc > 2) 
{ - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for last large var in CDF-1/2", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - nerrs += check_fix_var(filename); - nerrs += check_last_var(filename); - nerrs += check_fix_rec_var(filename); - nerrs += check_rec_var(filename, 0); - nerrs += check_rec_var(filename, NC_64BIT_OFFSET); - nerrs += check_rec_var(filename, NC_64BIT_DATA); - nerrs += check_not_last_var(filename); - nerrs += check_add_var(filename); - nerrs += check_var_offset(filename); + nerrs += check_fix_var(out_path, info); + nerrs += check_last_var(out_path, info); + nerrs += check_fix_rec_var(out_path, info); + + nerrs += check_rec_var(out_path, 0, info); + nerrs += check_rec_var(out_path, NC_64BIT_OFFSET, info); + nerrs += check_rec_var(out_path, NC_64BIT_DATA, info); + + nerrs += check_not_last_var(out_path, info); + nerrs += check_add_var(out_path, info); + nerrs += check_var_offset(out_path, info); #ifdef TEST_NETCDF if (nerrs) printf("fail with %d mismatches\n",nerrs); else printf("pass\n"); -#else - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } #endif - 
MPI_Finalize(); - return (nerrs > 0); + return nerrs; } +int main(int argc, char **argv) { + + int err; + int formats[] = {0}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "last large var in CDF-1/2", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/mix_collectives.c b/test/testcases/mix_collectives.c index 583a295997..c453c18d0b 100644 --- a/test/testcases/mix_collectives.c +++ b/test/testcases/mix_collectives.c @@ -13,10 +13,13 @@ #include -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - char filename[256]; int i, j, err, nerrs=0, rank, nprocs; int ncid, dimid[2], varid, varids[4]; MPI_Offset start[2], count[2], stride[2], imap[2]; @@ -35,32 +38,19 @@ int main(int argc, char **argv) 204, NC_FILL_INT, 205, NC_FILL_INT, 304, 310, 316, 322, NC_FILL_INT, NC_FILL_INT, NC_FILL_INT, NC_FILL_INT, 305, 311, 317, 323}; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s 
for get/put varm ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - #ifdef DEBUG if (nprocs > 1 && rank == 0) printf("Warning: %s is designed to run on 1 process\n", argv[0]); #endif - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER | NC_64BIT_DATA, - MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define a variable of a 6 x 4 integer array in the nc file */ err = ncmpi_def_dim(ncid, "Y", 12, &dimid[0]); CHECK_ERR @@ -165,7 +155,7 @@ int main(int argc, char **argv) err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_WRITE, info, &ncid); CHECK_ERR err = ncmpi_inq_varid(ncid, "var", &varid); CHECK_ERR @@ -181,7 +171,7 @@ int main(int argc, char **argv) __LINE__,__FILE__,i,g_buf[i],check_buf[i]); nerrs++; free(check_buf); - goto err_out; + goto syn_err; } } } @@ -207,7 +197,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d]=%d but got %d\n", __LINE__,__FILE__,i, j*4+i + rank*100, buf[j][i]); nerrs++; - goto err_out; + goto syn_err; } } } @@ -222,7 +212,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d]=%d but got %d\n", __LINE__,__FILE__,j, j+rank*100, *val); nerrs++; - goto err_out; + goto syn_err; } val++; } @@ -238,7 +228,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d][%d]=%d but got %d\n", __LINE__,__FILE__,j,i, j*4+i + rank*100, buf[j][i]); nerrs++; - goto err_out; + goto syn_err; } } } @@ -251,11 +241,15 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d][%d]=%d but got %d\n", __LINE__,__FILE__,j,i, -1, buf[j][i]); nerrs++; - goto err_out; + goto syn_err; } } } +syn_err: + MPI_Allreduce(MPI_IN_PLACE, 
&nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + if (nerrs) goto err_out; + /* test when different processes call put APIs with different varid */ err = ncmpi_redef(ncid); CHECK_ERR err = ncmpi_def_var(ncid, "scalar0", NC_INT, 0, NULL, &varids[0]); CHECK_ERR @@ -277,24 +271,31 @@ int main(int argc, char **argv) err_out: err = ncmpi_close(ncid); CHECK_ERR - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "mixed kind collective APIs", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/modes.c b/test/testcases/modes.c index 23d04099e7..5c99c7669f 100644 --- a/test/testcases/modes.c +++ b/test/testcases/modes.c @@ -33,7 +33,11 @@ } static -int check_modes(char *filename) +int test_io(const char 
*out_path, + const char *in_path, /* ignored */ + int format, /* ignored */ + int coll_io, /* ignored */ + MPI_Info info) { int rank, err, nerrs=0, ncid, cmode; char *path; @@ -43,27 +47,32 @@ int check_modes(char *filename) /* delete the file and ignore error */ /* remove the file system type prefix name if there is any. For example, - * when filename = "lustre:/home/foo/testfile.nc", remove "lustre:" to make + * when out_path = "lustre:/home/foo/testfile.nc", remove "lustre:" to make * path pointing to "/home/foo/testfile.nc", so it can be used in POSIX * unlink() and access() below */ - path = remove_file_system_type_prefix(filename); + path = remove_file_system_type_prefix(out_path); + + MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) unlink(path); + MPI_Barrier(MPI_COMM_WORLD); /* create a new file and test various cmodes ----------------------------*/ /* It is illegal to use both NC_64BIT_OFFSET and NC_64BIT_DATA together */ cmode = NC_CLOBBER | NC_64BIT_OFFSET | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); EXP_ERR(NC_EINVAL_CMODE) + MPI_Barrier(MPI_COMM_WORLD); + /* The file should not be created */ if (rank == 0) { if (access(path, F_OK) == 0) { printf("Error at %s:%d : file (%s) should not be created\n", - __FILE__,__LINE__, filename); + __FILE__,__LINE__, out_path); nerrs++; /* delete the file and ignore error */ unlink(path); @@ -72,47 +81,61 @@ int check_modes(char *filename) } MPI_Barrier(MPI_COMM_WORLD); -#ifdef ENABLE_NETCDF4 - /* It is illegal to use both NC_64BIT_OFFSET and NC_NETCDF4 together */ - cmode = NC_CLOBBER | NC_64BIT_OFFSET | NC_NETCDF4; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); - EXP_ERR(NC_EINVAL_CMODE) + if (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC) { + /* It is illegal to use both NC_64BIT_OFFSET and NC_NETCDF4 together */ + if (format == NC_FORMAT_NETCDF4_CLASSIC) 
+ cmode = NC_CLOBBER | NC_64BIT_OFFSET | NC_NETCDF4 | NC_CLASSIC_MODEL; + else + cmode = NC_CLOBBER | NC_64BIT_OFFSET | NC_NETCDF4; - /* The file should not be created */ - if (rank == 0) { - if (access(path, F_OK) == 0) { - printf("Error at %s:%d : file (%s) should not be created\n", - __FILE__,__LINE__, filename); - nerrs++; - /* delete the file and ignore error */ - unlink(path); + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); + EXP_ERR(NC_EINVAL_CMODE) + + MPI_Barrier(MPI_COMM_WORLD); + + /* The file should not be created */ + if (rank == 0) { + if (access(path, F_OK) == 0) { + printf("Error at %s:%d : file (%s) should not be created\n", + __FILE__,__LINE__, out_path); + nerrs++; + /* delete the file and ignore error */ + unlink(path); + } + /* else : file does not exist */ } - /* else : file does not exist */ - } - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); - /* It is illegal to use both NC_64BIT_DATA and NC_NETCDF4 together */ - cmode = NC_CLOBBER | NC_64BIT_DATA | NC_NETCDF4; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); - EXP_ERR(NC_EINVAL_CMODE) + /* It is illegal to use both NC_64BIT_DATA and NC_NETCDF4 together */ + if (format == NC_FORMAT_NETCDF4_CLASSIC) + cmode = NC_CLOBBER | NC_64BIT_DATA | NC_NETCDF4 | NC_CLASSIC_MODEL; + else + cmode = NC_CLOBBER | NC_64BIT_DATA | NC_NETCDF4; - /* The file should not be created */ - if (rank == 0) { - if (access(path, F_OK) == 0) { - printf("Error at %s:%d : file (%s) should not be created\n", - __FILE__,__LINE__, filename); - nerrs++; - /* delete the file and ignore error */ - unlink(path); + err = ncmpi_create(MPI_COMM_WORLD, out_path, cmode, info, &ncid); + EXP_ERR(NC_EINVAL_CMODE) + + MPI_Barrier(MPI_COMM_WORLD); + + /* The file should not be created */ + if (rank == 0) { + if (access(path, F_OK) == 0) { + printf("Error at %s:%d : file (%s) should not be created\n", + __FILE__,__LINE__, out_path); + nerrs++; + /* delete the file and ignore 
error */ + unlink(path); + } + /* else : file does not exist */ } - /* else : file does not exist */ + MPI_Barrier(MPI_COMM_WORLD); } - MPI_Barrier(MPI_COMM_WORLD); -#endif /* Collectively opening a non-existing file for read, expect error code * NC_ENOENT on all processes */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); + + MPI_Barrier(MPI_COMM_WORLD); /* When using MVAPICH2 2.2, its Lustre driver adds O_CREAT to all open * calls. This is considered a bug in an MPI-IO implementation. Due to this @@ -131,7 +154,7 @@ int check_modes(char *filename) if (rank == 0) { if (access(path, F_OK) == 0) { printf("Error at line %d in %s: file (%s) should not be created\n", - __LINE__,__FILE__, filename); + __LINE__,__FILE__, out_path); nerrs++; /* delete the file and ignore error */ unlink(path); @@ -143,7 +166,9 @@ int check_modes(char *filename) /* Collectively opening a non-existing file for write, expect error code * NC_ENOENT on all processes */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_WRITE, info, &ncid); + + MPI_Barrier(MPI_COMM_WORLD); /* When using MVAPICH2 2.2, its Lustre driver adds O_CREAT to all open * calls. This is considered a bug in an MPI-IO implementation. 
Due to this @@ -162,7 +187,7 @@ int check_modes(char *filename) if (rank == 0) { if (access(path, F_OK) == 0) { printf("Error at line %d in %s: file (%s) should not be created\n", - __LINE__,__FILE__, filename); + __LINE__,__FILE__, out_path); nerrs++; /* delete the file and ignore error */ unlink(path); @@ -172,66 +197,34 @@ int check_modes(char *filename) } MPI_Barrier(MPI_COMM_WORLD); - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR return nerrs; } -int main(int argc, char** argv) -{ - char *filename=NULL; - int len, rank, err, nerrs=0; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); +int main(int argc, char **argv) { - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) filename = strdup(argv[1]); - else filename = strdup("testfile.nc"); - len = (int)strlen(filename) + 1; - MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(filename, len, MPI_CHAR, 0, MPI_COMM_WORLD); + int err; + loop_opts opt; - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for file create/open modes ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + MPI_Init(&argc, &argv); - /* test under safe mode enabled */ - setenv("PNETCDF_SAFE_MODE", "1", 1); - nerrs += check_modes(filename); - - /* test under safe mode disabled */ - setenv("PNETCDF_SAFE_MODE", "0", 1); - nerrs += check_modes(filename); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet 
to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } - free(filename); + err = tst_main(argc, argv, "file create/open modes", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/ncmpi_vars_null_stride.c b/test/testcases/ncmpi_vars_null_stride.c index 0cfd81aacd..311c3ae98e 100644 --- a/test/testcases/ncmpi_vars_null_stride.c +++ b/test/testcases/ncmpi_vars_null_stride.c @@ -30,11 +30,15 @@ #define NY 4 #define NX 2 -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { int err, nerrs=0, ncid, dimid[NDIMS], varid[5], ndims=NDIMS; - int i, j, k, nprocs, rank, req, *buf; + int i, j, k, nprocs, rank, req, *buf=NULL; MPI_Offset start[NDIMS] = {0}; MPI_Offset count[NDIMS] = {0}; MPI_Offset stride[NDIMS] = {0}; @@ -42,8 +46,11 @@ tst_fmt(char *filename, int cmode) MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + if (err != NC_NOERR) goto err_out; + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", nprocs*NX, &dimid[1]); CHECK_ERR @@ -54,17 +61,30 @@ tst_fmt(char *filename, int cmode) err = ncmpi_def_var(ncid, "v4", NC_INT, ndims, dimid, &varid[4]); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); CHECK_ERR + } + + buf = (int*) malloc(sizeof(int) * NY * NX); + for (i=0; i 0); } -int main(int argc, char **argv) -{ - char filename[256], *hint_value; - int rank, err, nerrs=0, bb_enabled=0; +int main(int argc, char **argv) { - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); + int err; + loop_opts opt; - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for NULL stride ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + MPI_Init(&argc, &argv); - nerrs += tst_fmt(filename, 0); - if (nerrs) goto fn_exit; - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (nerrs) goto fn_exit; - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - if (nerrs) goto fn_exit; - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); - if (nerrs) goto fn_exit; -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - if (nerrs) goto fn_exit; - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = 
ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ -fn_exit: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "NULL stride", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } + diff --git a/test/testcases/noclobber.c b/test/testcases/noclobber.c index 23f17598b9..75c3ca252c 100644 --- a/test/testcases/noclobber.c +++ b/test/testcases/noclobber.c @@ -20,86 +20,64 @@ #include -static int -tst_fmt(char *filename, int fmt_flag) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - int err, nerrs=0, ncid, cmode; + int err, nerrs=0, ncid; MPI_Barrier(MPI_COMM_WORLD); + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a file if it does not exist */ - cmode = NC_CLOBBER | fmt_flag; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR + + err = ncmpi_enddef(ncid); CHECK_ERR + + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_close(ncid); CHECK_ERR /* now the file exists, test if PnetCDF can return correct error code */ - cmode = NC_NOCLOBBER | fmt_flag; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_NOCLOBBER, info, &ncid); EXP_ERR(NC_EEXIST) /* err == NC_EOFILE */ + MPI_Barrier(MPI_COMM_WORLD); + return nerrs; } int main(int argc, char **argv) { - char *filename, *hint_value; - int err, nerrs=0, len, rank, bb_enabled=0; + + int err; + loop_opts opt; MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) filename = strdup(argv[1]); - else filename = strdup("testfile.nc"); - len = (int)strlen(filename) + 1; - MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(filename, len, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for NC_NOCLOBBER and NC_EEXIST ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } - - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, 
NC_NETCDF4|NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } - free(filename); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "NC_NOCLOBBER and NC_EEXIST", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/nonblocking.c b/test/testcases/nonblocking.c index 169d7f45b2..a189d8b3fc 100644 --- a/test/testcases/nonblocking.c +++ b/test/testcases/nonblocking.c @@ -43,44 +43,25 @@ #define NY 4 #define NX 5 -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) { - - char filename[256]; - int i, j, err, ncid, varid, dimids[2], req[2], st[2], nerrs=0; - int rank, nprocs, buf[NY+1][NX]; +int tst_iput(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int i, j, err, ncid, varid, dimids[2], req[2], st[2], nerrs=0; + int rank, nprocs, 
buf[NY+1][NX]; MPI_Offset start[2], count[2]; - MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for using ncmpi_iput_vara_int() ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - MPI_Info_create(&info); - /* When using PVFS2, unexpected buffer value error message might occur. - * This is due to a possible bug in ADIOI_PVFS2_OldWriteStrided() when - * filetype is contiguous and buftype is non-contiguous. - * Fix: Add ROMIO hint to force ADIO driever to use POSIX I/O */ - /* MPI_Info_set(info, "romio_pvfs2_posix_write", "enable"); */ - - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR - MPI_Info_free(&info); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_FATAL_ERR /* define a 2D array */ err = ncmpi_def_dim(ncid, "Y", NC_UNLIMITED, &dimids[0]); CHECK_ERR @@ -88,6 +69,10 @@ int main(int argc, char **argv) { err = ncmpi_def_var(ncid, "var", NC_INT, 2, dimids, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_sync(ncid); CHECK_ERR + } + /* initialize the contents of the array */ for (j=0; j 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } +fn_exit: + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); 
- else printf(PASS_STR); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int nerrs=0; + + nerrs = tst_iput(out_path, in_path, format, coll_io, info); + if (nerrs > 0) goto err_out; + + /* disable PnetCDF internal buffering */ + MPI_Info_set(info, "nc_ibuf_size", "0"); + + nerrs = tst_iput(out_path, in_path, format, coll_io, info); + if (nerrs > 0) goto err_out; + +err_out: + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "ncmpi_iput_vara_int()", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/null_args.m4 b/test/testcases/null_args.m4 index 65484f3802..8627ef3353 100644 --- a/test/testcases/null_args.m4 +++ b/test/testcases/null_args.m4 @@ -66,186 +66,191 @@ define(`TEST_NULL_ARGS',` memset($1_buf, 0, 100*sizeof($1)); /*---- test put_var1 ---- */ - err = ncmpi_put_var1_$1_all(ncid, vid_$1, start, $1_buf); - EXP_ERR_MSG(NC_NOERR, "put_var1_$1_all") + err = ncmpi_put_var1_$1$2(ncid, vid_$1, start, $1_buf); + EXP_ERR_MSG(NC_NOERR, "put_var1_$1$2") - err = ncmpi_put_var1_$1_all(ncid, vid_$1, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_var1_$1_all start=NULL") + err = ncmpi_put_var1_$1$2(ncid, vid_$1, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_var1_$1$2 start=NULL") 
/*---- test put_vara ---- */ - err = ncmpi_put_vara_$1_all(ncid, vid_$1, start, count, $1_buf); - EXP_ERR_MSG(NC_NOERR, "put_vara_$1_all") + err = ncmpi_put_vara_$1$2(ncid, vid_$1, start, count, $1_buf); + EXP_ERR_MSG(NC_NOERR, "put_vara_$1$2") - err = ncmpi_put_vara_$1_all(ncid, vid_$1, NULL, count, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_vara_$1_all start=NULL") + err = ncmpi_put_vara_$1$2(ncid, vid_$1, NULL, count, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_vara_$1$2 start=NULL") - err = ncmpi_put_vara_$1_all(ncid, vid_$1, start, NULL, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "put_vara_$1_all count=NULL") + err = ncmpi_put_vara_$1$2(ncid, vid_$1, start, NULL, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "put_vara_$1$2 count=NULL") - err = ncmpi_put_vara_$1_all(ncid, vid_$1, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_vara_$1_all start=count=NULL") + err = ncmpi_put_vara_$1$2(ncid, vid_$1, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_vara_$1$2 start=count=NULL") /*---- test put_vars ---- */ - err = ncmpi_put_vars_$1_all(ncid, vid_$1, start, count, stride, $1_buf); - EXP_ERR_MSG(NC_NOERR, "put_vars_$1_all") + err = ncmpi_put_vars_$1$2(ncid, vid_$1, start, count, stride, $1_buf); + EXP_ERR_MSG(NC_NOERR, "put_vars_$1$2") - err = ncmpi_put_vars_$1_all(ncid, vid_$1, NULL, count, stride, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_vars_$1_all start=NULL") + err = ncmpi_put_vars_$1$2(ncid, vid_$1, NULL, count, stride, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_vars_$1$2 start=NULL") - err = ncmpi_put_vars_$1_all(ncid, vid_$1, start, NULL, stride, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "put_vars_$1_all count=NULL") + err = ncmpi_put_vars_$1$2(ncid, vid_$1, start, NULL, stride, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "put_vars_$1$2 count=NULL") - err = ncmpi_put_vars_$1_all(ncid, vid_$1, start, count, NULL, $1_buf); - EXP_ERR_MSG(NC_NOERR, "put_vars_$1_all stride=NULL") + err = ncmpi_put_vars_$1$2(ncid, vid_$1, start, count, NULL, $1_buf); + EXP_ERR_MSG(NC_NOERR, 
"put_vars_$1$2 stride=NULL") - err = ncmpi_put_vars_$1_all(ncid, vid_$1, NULL, NULL, stride, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_vars_$1_all start=count=NULL") + err = ncmpi_put_vars_$1$2(ncid, vid_$1, NULL, NULL, stride, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_vars_$1$2 start=count=NULL") - err = ncmpi_put_vars_$1_all(ncid, vid_$1, NULL, count, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_vars_$1_all start=stride=NULL") + err = ncmpi_put_vars_$1$2(ncid, vid_$1, NULL, count, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_vars_$1$2 start=stride=NULL") - err = ncmpi_put_vars_$1_all(ncid, vid_$1, start, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "put_vars_$1_all count=stride=NULL") + err = ncmpi_put_vars_$1$2(ncid, vid_$1, start, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "put_vars_$1$2 count=stride=NULL") - err = ncmpi_put_vars_$1_all(ncid, vid_$1, NULL, NULL, NULL, $1_buf); + err = ncmpi_put_vars_$1$2(ncid, vid_$1, NULL, NULL, NULL, $1_buf); EXP_ERR_MSG(NC_EINVALCOORDS, "put_vars start=count=stride=NULL") /*---- test put_varm ---- */ - err = ncmpi_put_varm_$1_all(ncid, vid_$1, start, count, stride, imap, $1_buf); - EXP_ERR_MSG(NC_NOERR, "put_varm_$1_all") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, start, count, stride, imap, $1_buf); + EXP_ERR_MSG(NC_NOERR, "put_varm_$1$2") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, NULL, count, stride, imap, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1_all start=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, NULL, count, stride, imap, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1$2 start=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, start, NULL, stride, imap, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "put_varm_$1_all count=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, start, NULL, stride, imap, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "put_varm_$1$2 count=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, start, count, NULL, imap, $1_buf); - EXP_ERR_MSG(NC_NOERR, "put_varm_$1_all stride=NULL") + 
err = ncmpi_put_varm_$1$2(ncid, vid_$1, start, count, NULL, imap, $1_buf); + EXP_ERR_MSG(NC_NOERR, "put_varm_$1$2 stride=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, start, count, stride, NULL, $1_buf); - EXP_ERR_MSG(NC_NOERR, "put_varm_$1_all imap=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, start, count, stride, NULL, $1_buf); + EXP_ERR_MSG(NC_NOERR, "put_varm_$1$2 imap=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, NULL, NULL, stride, imap, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1_all start=count=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, NULL, NULL, stride, imap, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1$2 start=count=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, NULL, count, NULL, imap, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1_all start=stride=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, NULL, count, NULL, imap, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1$2 start=stride=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, NULL, count, stride, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1_all start=imap=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, NULL, count, stride, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1$2 start=imap=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, start, NULL, NULL, imap, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "put_varm_$1_all count=stride=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, start, NULL, NULL, imap, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "put_varm_$1$2 count=stride=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, start, NULL, stride, NULL, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "put_varm_$1_all count=imap=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, start, NULL, stride, NULL, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "put_varm_$1$2 count=imap=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, start, count, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_NOERR, "put_varm_$1_all stride=imap=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, start, count, 
NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_NOERR, "put_varm_$1$2 stride=imap=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, NULL, NULL, NULL, imap, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1_all start=count=stride=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, NULL, NULL, NULL, imap, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1$2 start=count=stride=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, NULL, NULL, stride, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1_all start=count=imap=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, NULL, NULL, stride, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1$2 start=count=imap=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, start, NULL, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "put_varm_$1_all count=stride=imap=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, start, NULL, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "put_varm_$1$2 count=stride=imap=NULL") - err = ncmpi_put_varm_$1_all(ncid, vid_$1, NULL, NULL, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1_all start=count=stride=imap=NULL") + err = ncmpi_put_varm_$1$2(ncid, vid_$1, NULL, NULL, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "put_varm_$1$2 start=count=stride=imap=NULL") + + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); /*---- test get_var1 ---- */ - err = ncmpi_get_var1_$1_all(ncid, vid_$1, start, $1_buf); - EXP_ERR_MSG(NC_NOERR, "get_var1_$1_all") + err = ncmpi_get_var1_$1$2(ncid, vid_$1, start, $1_buf); + EXP_ERR_MSG(NC_NOERR, "get_var1_$1$2") - err = ncmpi_get_var1_$1_all(ncid, vid_$1, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_var1_$1_all start=NULL") + err = ncmpi_get_var1_$1$2(ncid, vid_$1, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_var1_$1$2 start=NULL") /*---- test get_vara ---- */ - err = ncmpi_get_vara_$1_all(ncid, vid_$1, start, count, $1_buf); - EXP_ERR_MSG(NC_NOERR, "get_vara_$1_all") + err = 
ncmpi_get_vara_$1$2(ncid, vid_$1, start, count, $1_buf); + EXP_ERR_MSG(NC_NOERR, "get_vara_$1$2") - err = ncmpi_get_vara_$1_all(ncid, vid_$1, NULL, count, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_vara_$1_all start=NULL") + err = ncmpi_get_vara_$1$2(ncid, vid_$1, NULL, count, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_vara_$1$2 start=NULL") - err = ncmpi_get_vara_$1_all(ncid, vid_$1, start, NULL, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "get_vara_$1_all count=NULL") + err = ncmpi_get_vara_$1$2(ncid, vid_$1, start, NULL, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "get_vara_$1$2 count=NULL") - err = ncmpi_get_vara_$1_all(ncid, vid_$1, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_vara_$1_all start=count=NULL") + err = ncmpi_get_vara_$1$2(ncid, vid_$1, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_vara_$1$2 start=count=NULL") /*---- test get_vars ---- */ - err = ncmpi_get_vars_$1_all(ncid, vid_$1, start, count, stride, $1_buf); - EXP_ERR_MSG(NC_NOERR, "get_vars_$1_all") + err = ncmpi_get_vars_$1$2(ncid, vid_$1, start, count, stride, $1_buf); + EXP_ERR_MSG(NC_NOERR, "get_vars_$1$2") - err = ncmpi_get_vars_$1_all(ncid, vid_$1, NULL, count, stride, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_vars_$1_all start=NULL") + err = ncmpi_get_vars_$1$2(ncid, vid_$1, NULL, count, stride, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_vars_$1$2 start=NULL") - err = ncmpi_get_vars_$1_all(ncid, vid_$1, start, NULL, stride, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "get_vars_$1_all count=NULL") + err = ncmpi_get_vars_$1$2(ncid, vid_$1, start, NULL, stride, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "get_vars_$1$2 count=NULL") - err = ncmpi_get_vars_$1_all(ncid, vid_$1, start, count, NULL, $1_buf); - EXP_ERR_MSG(NC_NOERR, "get_vars_$1_all stride=NULL") + err = ncmpi_get_vars_$1$2(ncid, vid_$1, start, count, NULL, $1_buf); + EXP_ERR_MSG(NC_NOERR, "get_vars_$1$2 stride=NULL") - err = ncmpi_get_vars_$1_all(ncid, vid_$1, NULL, NULL, stride, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_vars_$1_all 
start=count=NULL") + err = ncmpi_get_vars_$1$2(ncid, vid_$1, NULL, NULL, stride, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_vars_$1$2 start=count=NULL") - err = ncmpi_get_vars_$1_all(ncid, vid_$1, NULL, count, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_vars_$1_all start=stride=NULL") + err = ncmpi_get_vars_$1$2(ncid, vid_$1, NULL, count, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_vars_$1$2 start=stride=NULL") - err = ncmpi_get_vars_$1_all(ncid, vid_$1, start, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "get_vars_$1_all count=stride=NULL") + err = ncmpi_get_vars_$1$2(ncid, vid_$1, start, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "get_vars_$1$2 count=stride=NULL") - err = ncmpi_get_vars_$1_all(ncid, vid_$1, NULL, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_vars_$1_all start=count=stride=NULL") + err = ncmpi_get_vars_$1$2(ncid, vid_$1, NULL, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_vars_$1$2 start=count=stride=NULL") /*---- test get_varm ---- */ - err = ncmpi_get_varm_$1_all(ncid, vid_$1, start, count, stride, imap, $1_buf); - EXP_ERR_MSG(NC_NOERR, "get_varm_$1_all") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, start, count, stride, imap, $1_buf); + EXP_ERR_MSG(NC_NOERR, "get_varm_$1$2") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, NULL, count, stride, imap, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1_all start=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, NULL, count, stride, imap, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1$2 start=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, start, NULL, stride, imap, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "get_varm_$1_all count=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, start, NULL, stride, imap, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "get_varm_$1$2 count=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, start, count, NULL, imap, $1_buf); - EXP_ERR_MSG(NC_NOERR, "get_varm_$1_all stride=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, start, count, NULL, imap, 
$1_buf); + EXP_ERR_MSG(NC_NOERR, "get_varm_$1$2 stride=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, start, count, stride, NULL, $1_buf); - EXP_ERR_MSG(NC_NOERR, "get_varm_$1_all imap=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, start, count, stride, NULL, $1_buf); + EXP_ERR_MSG(NC_NOERR, "get_varm_$1$2 imap=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, NULL, NULL, stride, imap, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1_all start=count=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, NULL, NULL, stride, imap, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1$2 start=count=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, NULL, count, NULL, imap, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1_all start=stride=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, NULL, count, NULL, imap, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1$2 start=stride=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, NULL, count, stride, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1_all start=imap=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, NULL, count, stride, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1$2 start=imap=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, start, NULL, NULL, imap, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "get_varm_$1_all count=stride=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, start, NULL, NULL, imap, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "get_varm_$1$2 count=stride=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, start, NULL, stride, NULL, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "get_varm_$1_all count=imap=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, start, NULL, stride, NULL, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "get_varm_$1$2 count=imap=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, start, count, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_NOERR, "get_varm_$1_all stride=imap=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, start, count, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_NOERR, "get_varm_$1$2 
stride=imap=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, NULL, NULL, NULL, imap, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1_all start=count=stride=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, NULL, NULL, NULL, imap, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1$2 start=count=stride=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, NULL, NULL, stride, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1_all start=count=imap=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, NULL, NULL, stride, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1$2 start=count=imap=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, start, NULL, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EEDGE, "get_varm_$1_all count=stride=imap=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, start, NULL, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EEDGE, "get_varm_$1$2 count=stride=imap=NULL") - err = ncmpi_get_varm_$1_all(ncid, vid_$1, NULL, NULL, NULL, NULL, $1_buf); - EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1_all start=count=stride=imap=NULL") + err = ncmpi_get_varm_$1$2(ncid, vid_$1, NULL, NULL, NULL, NULL, $1_buf); + EXP_ERR_MSG(NC_EINVALCOORDS, "get_varm_$1$2 start=count=stride=imap=NULL") ')dnl define(`CDF5_ITYPES',`schar,uchar,short,ushort,int,uint,long,float,double,longlong,ulonglong')dnl @@ -255,7 +260,7 @@ define(`EXTRA_ITYPES',`uchar,ushort,uint,longlong,ulonglong')dnl define(`TEST_FORMAT',dnl `dnl static int -test_format_nc$1(char *filename) +test_format_nc$1(const char *filename, int format, int coll_io, MPI_Info info) { int err, nerrs=0, ncid, cmode, dimid[2]; MPI_Offset start[2], count[2], stride[2], imap[2]; @@ -284,7 +289,7 @@ test_format_nc$1(char *filename) `$1',`4',`cmode = NC_CLOBBER | NC_NETCDF4 | NC_CLASSIC_MODEL;', `cmode = NC_CLOBBER;')dnl - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); EXP_ERR_MSG(NC_NOERR, "create") err = ncmpi_def_dim(ncid, "Y", 
NC_UNLIMITED, &dimid[0]); @@ -295,17 +300,36 @@ test_format_nc$1(char *filename) /* define variables */dnl foreach(`itype',(text, TYPE_LIST),`_CAT(` err = ncmpi_def_var(ncid,"var_'itype`",NC_TYPE(itype),2,dimid,&vid_',itype`); - EXP_ERR_MSG(NC_NOERR,"def_var")')') + EXP_ERR_MSG(NC_NOERR,"def_var") + err = ncmpi_def_var_fill(ncid, vid_'itype`, 0, NULL); + EXP_ERR_MSG(NC_NOERR,"def_var_fill")')') err = ncmpi_enddef(ncid); EXP_ERR_MSG(NC_NOERR,"enddef") + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { + /* fill the 1st record of all variables */dnl + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err = ncmpi_fill_var_rec(ncid, vid_'itype`, 0); + EXP_ERR_MSG(NC_NOERR,"fill_var_rec")')') + } + start[0] = start[1] = 0; count[0] = count[1] = 1; stride[0] = stride[1] = 1; imap[0] = imap[1] = 1; - foreach(`itype',(text, TYPE_LIST),`TEST_NULL_ARGS(itype)') + if (coll_io) { + foreach(`itype',(text, TYPE_LIST),`TEST_NULL_ARGS(itype, _all)') + } + else { + foreach(`itype',(text, TYPE_LIST),`TEST_NULL_ARGS(itype)') + } err_out: err = ncmpi_close(ncid); @@ -318,67 +342,65 @@ err_out: TEST_FORMAT(1) TEST_FORMAT(2) TEST_FORMAT(5) -#ifdef ENABLE_NETCDF4 TEST_FORMAT(3) TEST_FORMAT(4) -#endif -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256], *hint_value;; - int err, nerrs=0, rank, bb_enabled=0; + char val[MPI_MAX_INFO_VAL]; + int err, nerrs=0, flag; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* check whether burst buffering is enabled */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0 && + (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC)) + /* does not work for NetCDF4 files when burst-buffering is enabled */ + return 0; + + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + if (format == NC_FORMAT_CLASSIC) + nerrs += test_format_nc1(out_path, format, coll_io, info); + else if (format == NC_FORMAT_64BIT_OFFSET) + nerrs += test_format_nc2(out_path, format, coll_io, info); + else if (format == NC_FORMAT_NETCDF4) + nerrs += test_format_nc3(out_path, format, coll_io, info); + else if (format == NC_FORMAT_NETCDF4_CLASSIC) + nerrs += test_format_nc4(out_path, format, coll_io, info); + else if (format == NC_FORMAT_64BIT_DATA) /* CDF-5 */ + nerrs += test_format_nc5(out_path, format, coll_io, info); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for NULL arguments ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + return nerrs; +} - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } +int main(int argc, char **argv) { - nerrs += test_format_nc1(filename); - nerrs += test_format_nc2(filename); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += test_format_nc3(filename); - nerrs += test_format_nc4(filename); -#endif - } - nerrs += test_format_nc5(filename); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + int err; + loop_opts opt; - MPI_Allreduce(MPI_IN_PLACE, &nerrs,
1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "NULL arguments", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/one_record.c b/test/testcases/one_record.c index 0af91fec97..51d74d9a2f 100644 --- a/test/testcases/one_record.c +++ b/test/testcases/one_record.c @@ -26,40 +26,30 @@ #define STR_LEN 19 #define NUM_VALS 2 -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; - int i, err, nerrs=0, rank, nprocs, cmode; + int i, err, nerrs=0, rank, nprocs; int ncid, dimids[2], varid; char data[NUM_VALS][STR_LEN + 1], data_in[NUM_VALS*STR_LEN]; MPI_Offset start[2]; MPI_Offset count[2]; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for only one record variable ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - 
free(cmd_str); - } - strcpy(data[0], "2005-04-11_12:00:00"); /* 19 bytes not a multiply of 4 */ strcpy(data[1], "2005-04-11_13:00:00"); - cmode = NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "time", NC_UNLIMITED, dimids); CHECK_ERR err = ncmpi_def_dim(ncid, "text_dim", STR_LEN, &dimids[1]); CHECK_ERR @@ -70,25 +60,45 @@ int main(int argc, char **argv) err = ncmpi_def_var(ncid, "text_var", NC_CHAR, 2, dimids, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } /* Write some records of var data. */ count[0] = 1; count[1] = STR_LEN; start[0] = 0; start[1] = 0; for (i=0; i 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "only one record variable", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/parallel_run.sh 
b/test/testcases/parallel_run.sh index 0abc8b12fb..a78da1f240 100755 --- a/test/testcases/parallel_run.sh +++ b/test/testcases/parallel_run.sh @@ -1,27 +1,30 @@ #!/bin/bash # -# Copyright (C) 2018, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $MPIRUN $*" + fi + if test "x$DRY_RUN" = xno ; then + $MPIRUN "$@" + fi +} MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` - -# let NTHREADS=$1*6-1 -NTHREADS=`expr $1 \* 6 - 1` - -# echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -30,87 +33,38 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS +if test "x$MIMIC_LUSTRE" != x1 ; then + export PNETCDF_HINTS="cb_nodes=2" +fi + for i in ${check_PROGRAMS} ; do - for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - - if test "$i" = tst_version ; then - ${MPIRUN} ./tst_version - continue - fi - - if test "$i" = tst_pthread ; then - # each MPI process created 6 threads -
${MPIRUN} ./tst_pthread ${TESTOUTDIR}/tst_pthread.nc - for k in `seq 0 ${NTHREADS}` ; do - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_pthread.nc.$k - rm -f ${OUTDIR}/tst_pthread.nc.$k - done - continue - fi - - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc - - # put_all_kinds and iput_all_kinds output 3 files - if test "$i" = put_all_kinds -o "$i" = iput_all_kinds ; then - for k in 1 2 5 ; do - # echo "--- validating file ${TESTOUTDIR}/$i.nc$k" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc$k - done - else - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - fi - # echo "" - - if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "---- test burst buffering feature" - saved_PNETCDF_HINTS=${PNETCDF_HINTS} - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc - export PNETCDF_HINTS=${saved_PNETCDF_HINTS} - - # put_all_kinds and iput_all_kinds output 3 files - if test "$i" = put_all_kinds -o "$i" = iput_all_kinds ; then - for k in 1 2 5 ; do - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc$k" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc$k - # echo "--- ncmpidiff $i.nc$k $i.bb.nc$k ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc$k ${TESTOUTDIR}/$i.bb.nc$k - done - continue - else - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc - fi - - # skip ncmpidiff for large file - if test "$i" = last_large_var ; then - continue - fi - - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc - fi - - if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc4 4 - # Validator does not support nc4 - fi - done - done - rm -f ${OUTDIR}/$i.nc* - rm -f ${OUTDIR}/$i.bb.nc* -done + + # # SECONDS=0 + # start_ns=$(date +%s.%4N) + + 
exe_name=`basename $i` + + # PNCIO driver does not support vard APIs + if test "x$exe_name" = xtest_vardf90 || test "x$exe_name" = xtest_vardf ; then + export PNETCDF_HINTS="nc_pncio=disable;$PNETCDF_HINTS" + fi + + for j in ${safe_modes} ; do + + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi + + run_cmd ./$i -q -o ${TESTOUTDIR}/${exe_name}.nc + + done # safe_modes + + # # echo "Elapsed: $SECONDS seconds" + # end_ns=$(date +%s.%4N) + # # Calculate difference (requires bc for floating point math) + # elapsed_ns=$(echo "$end_ns - $start_ns" | bc) + # echo "Elapsed time: ${elapsed_ns} seconds" + +done # check_PROGRAMS diff --git a/test/testcases/put_all_kinds.m4 b/test/testcases/put_all_kinds.m4 index 8cbac915d8..7585bbcba6 100644 --- a/test/testcases/put_all_kinds.m4 +++ b/test/testcases/put_all_kinds.m4 @@ -22,7 +22,7 @@ dnl #include #define NDIMS 3 -#define LEN 2 +#define LEN 7 include(`foreach.m4')dnl include(`utils.m4')dnl @@ -47,12 +47,43 @@ include(`utils.m4')dnl #define ulonglong unsigned long long #endif +define(`DEFINE_VARS',dnl +`dnl +static int +def_vars_$1(int rank, + int ncid, + int *dimids) +{ + int err=NC_NOERR; + int var1_id, vara_id, vars_id, varm_id; + int dimid, dimidsT[NDIMS]; + + err = ncmpi_inq_dimid(ncid, "nprocs", &dimid); CHECK_ERR + + err = ncmpi_def_var(ncid, "var1_$1", NC_TYPE($1), 1, &dimid, &var1_id); + CHECK_ERR + err = ncmpi_def_var(ncid, "vara_$1", NC_TYPE($1), NDIMS, dimids, &vara_id); + CHECK_ERR + err = ncmpi_def_var(ncid, "vars_$1", NC_TYPE($1), NDIMS, dimids, &vars_id); + CHECK_ERR + + /* define variable with transposed file layout: ZYX -> YXZ */ + dimidsT[0] = dimids[1]; dimidsT[1] = dimids[2]; dimidsT[2] = dimids[0]; + err = ncmpi_def_var(ncid, "varm_$1", NC_TYPE($1), NDIMS, dimidsT, &varm_id); + CHECK_ERR + + return err; +} +')dnl + 
+foreach(`itype',(`text,schar,uchar,short,ushort,int,uint,long,float,double,longlong,ulonglong'),`DEFINE_VARS(itype)') + define(`TEST_BLOCKING_PUT',dnl `dnl static int blocking_put_$1(int rank, + int coll_io, int ncid, - int *dimids, MPI_Offset *start, MPI_Offset *count, MPI_Offset *startS, @@ -63,34 +94,40 @@ blocking_put_$1(int rank, MPI_Offset *imap, ifelse(`$1',`text',`char',`double') *buf) { - int err, nerrs=0; + int err=NC_NOERR, nerrs=0; int var1_id, vara_id, vars_id, varm_id; - int dimid, dimidsT[NDIMS]; MPI_Offset start1[1]; - /* re-enter define mode, so we can add more variables */ - err = ncmpi_redef(ncid); CHECK_ERR - err = ncmpi_inq_dimid(ncid, "nprocs", &dimid); CHECK_ERR - err = ncmpi_def_var(ncid, "var1_$1", NC_TYPE($1), 1, &dimid, &var1_id); CHECK_ERR - err = ncmpi_def_var(ncid, "vara_$1", NC_TYPE($1), NDIMS, dimids, &vara_id); CHECK_ERR - err = ncmpi_def_var(ncid, "vars_$1", NC_TYPE($1), NDIMS, dimids, &vars_id); CHECK_ERR - - /* define variable with transposed file layout: ZYX -> YXZ */ - dimidsT[0] = dimids[1]; dimidsT[1] = dimids[2]; dimidsT[2] = dimids[0]; - err = ncmpi_def_var(ncid, "varm_$1", NC_TYPE($1), NDIMS, dimidsT, &varm_id); CHECK_ERR - - /* exit the define mode */ - err = ncmpi_enddef(ncid); CHECK_ERR + err = ncmpi_inq_varid(ncid, "var1_$1", &var1_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, "vara_$1", &vara_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, "vars_$1", &vars_id); CHECK_ERR + err = ncmpi_inq_varid(ncid, "varm_$1", &varm_id); CHECK_ERR /* write the whole variable in parallel */ start1[0] = rank; - err = `ncmpi_put_var1_'ifelse(`$1',`text',`$1',`double')`_all'(ncid, var1_id, start1, buf); CHECK_ERR + if (coll_io) + err = `ncmpi_put_var1_'ifelse(`$1',`text',`$1',`double')`_all'(ncid, var1_id, start1, buf); + else + err = `ncmpi_put_var1_'ifelse(`$1',`text',`$1',`double')(ncid, var1_id, start1, buf); + CHECK_ERR - err = `ncmpi_put_vara_'ifelse(`$1',`text',`$1',`double')`_all'(ncid, vara_id, start, count, buf); CHECK_ERR + if 
(coll_io) + err = `ncmpi_put_vara_'ifelse(`$1',`text',`$1',`double')`_all'(ncid, vara_id, start, count, buf); + else + err = `ncmpi_put_vara_'ifelse(`$1',`text',`$1',`double')(ncid, vara_id, start, count, buf); + CHECK_ERR - err = `ncmpi_put_vars_'ifelse(`$1',`text',`$1',`double')`_all'(ncid, vars_id, startS, countS, stride, buf); CHECK_ERR + if (coll_io) + err = `ncmpi_put_vars_'ifelse(`$1',`text',`$1',`double')`_all'(ncid, vars_id, startS, countS, stride, buf); + else + err = `ncmpi_put_vars_'ifelse(`$1',`text',`$1',`double')(ncid, vars_id, startS, countS, stride, buf); + CHECK_ERR - err = `ncmpi_put_varm_'ifelse(`$1',`text',`$1',`double')`_all'(ncid, varm_id, startM, countM, NULL, imap, buf); CHECK_ERR + if (coll_io) + err = `ncmpi_put_varm_'ifelse(`$1',`text',`$1',`double')`_all'(ncid, varm_id, startM, countM, NULL, imap, buf); + else + err = `ncmpi_put_varm_'ifelse(`$1',`text',`$1',`double')(ncid, varm_id, startM, countM, NULL, imap, buf); + CHECK_ERR return nerrs; } @@ -100,20 +137,12 @@ foreach(`itype',(`text,schar,uchar,short,ushort,int,uint,long,float,double,longl define(`TEST_CDF_FORMAT',dnl `dnl -/* create a new $1 file */ - cmode = NC_CLOBBER; - ifelse(`$1',`NC_FORMAT_64BIT_OFFSET', `cmode |= NC_64BIT_OFFSET;', - `$1',`NC_FORMAT_64BIT_DATA', `cmode |= NC_64BIT_DATA;', - `$1',`NC_FORMAT_NETCDF4_CLASSIC',`cmode |= NC_NETCDF4 | NC_CLASSIC_MODEL;', - `$1',`NC_FORMAT_NETCDF4', `cmode |= NC_NETCDF4;') - - sprintf(fname, "%s%d",filename, $1); - err = ncmpi_create(MPI_COMM_WORLD, fname, cmode, info, &ncid); + /* create a new file */ + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); if (err != NC_NOERR) { printf("Error at line %d in %s: ncmpi_create() file %s (%s)\n", __LINE__,__FILE__,fname,ncmpi_strerror(err)); - MPI_Abort(MPI_COMM_WORLD, -1); - exit(1); + CHECK_ERR } /* define dimensions */ @@ -121,66 +150,62 @@ define(`TEST_CDF_FORMAT',dnl err = ncmpi_def_dim(ncid, "Z", gsize[0], &dimids[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "Y", 
gsize[1], &dimids[1]); CHECK_ERR err = ncmpi_def_dim(ncid, "X", gsize[2], &dimids[2]); CHECK_ERR + + nerrs += def_vars_text(rank, ncid, dimids); CHECK_ERR + foreach(`itype',(`schar,short,int,long,float,double'),` + _CAT(`nerrs += def_vars_',itype)'`(rank, ncid, dimids); CHECK_ERR') + + if (format == NC_FORMAT_64BIT_DATA) { + foreach(`itype', + (`uchar,ushort,uint,longlong,ulonglong'),` + _CAT(`nerrs += def_vars_',itype)'`(rank, ncid, dimids); CHECK_ERR') + } + + /* define variables */ err = ncmpi_enddef(ncid); - nerrs += blocking_put_text(rank, ncid, dimids, start, count, + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + + nerrs += blocking_put_text(rank, coll_io, ncid, start, count, startS, countS, stride, startM, countM, imap, cbuf); foreach(`itype',(`schar,short,int,long,float,double'),` - _CAT(`nerrs += blocking_put_',itype)'`(rank, ncid, dimids, start, count, - startS, countS, stride, startM, countM, imap, buf);') - -ifelse(`$1', `NC_FORMAT_CLASSIC',`', - `$1', `NC_FORMAT_64BIT_OFFSET',`', - `$1', `NC_FORMAT_NETCDF4_CLASSIC',`',` - foreach(`itype', - (`uchar,ushort,uint,longlong,ulonglong'),` - _CAT(`nerrs += blocking_put_',itype)'`(rank, ncid, dimids, start, count, - startS, countS, stride, startM, countM, imap, buf);')') + _CAT(`nerrs += blocking_put_',itype)'`(rank, coll_io, ncid, start, count, + startS, countS, stride, startM, countM, imap, buf);') + + if (format == NC_FORMAT_64BIT_DATA) { + foreach(`itype', + (`uchar,ushort,uint,longlong,ulonglong'),` + _CAT(`nerrs += blocking_put_',itype)'`(rank, coll_io, ncid, start, + count, startS, countS, stride, startM, countM, imap, buf);') + } /* close the file */ err = ncmpi_close(ncid); CHECK_ERR ')dnl -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256], fname[512], *hint_value, 
*cbuf; - int i, j, k, rank, nprocs, ncid, bufsize, err, nerrs=0, cmode; - int bb_enabled=0, psize[NDIMS], dimids[NDIMS], dim_rank[NDIMS]; + char fname[512], *cbuf; + int i, j, k, rank, nprocs, ncid, bufsize, err, nerrs=0; + int psize[NDIMS], dimids[NDIMS], dim_rank[NDIMS]; double *buf; MPI_Offset gsize[NDIMS], stride[NDIMS], imap[NDIMS]; MPI_Offset start[NDIMS], count[NDIMS]; MPI_Offset startS[NDIMS], countS[NDIMS]; MPI_Offset startM[NDIMS], countM[NDIMS]; - MPI_Info info; - MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for all kinds put APIs ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } - /* calculate number of processes along each dimension */ for (i=0; i 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 
0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "all kinds put APIs", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/put_parameter.f b/test/testcases/put_parameter.f index 5f3d78da29..c3a4641519 100644 --- a/test/testcases/put_parameter.f +++ b/test/testcases/put_parameter.f @@ -61,8 +61,8 @@ subroutine check(err, message) ! It is a good idea to check returned value for possible error if (err .NE. NF_NOERR) then write(6,*) message(1:XTRIM(message)), nfmpi_strerror(err) - msg = '*** TESTING F77 put_parameter.f for immutable put ' - call pass_fail(1, msg) + msg = '*** TESTING F77 put_parameter.f - immutable put ' + call pass_fail(1, msg, 0) STOP 2 end if end ! subroutine check @@ -76,32 +76,40 @@ program main PARAMETER(NX=4) data buffer /5,6,7,8/ - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer err, ierr, nprocs, rank, nerrs, get_args, XTRIM integer cmode, ncid, varid(2), dimid(2) integer*8 len_ll, start(2), count(2) integer*8 malloc_size, sum_size + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 
0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, + MPI_COMM_WORLD, ierr) + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, + + ierr) + nerrs = 0 ! create file, truncate it if exists cmode = IOR(NF_CLOBBER, NF_64BIT_OFFSET) - err = nfmpi_create(MPI_COMM_WORLD, filename, cmode, + err = nfmpi_create(MPI_COMM_WORLD, out_path, cmode, + MPI_INFO_NULL, ncid) call check(err, 'In nfmpi_create: ') @@ -163,10 +171,18 @@ program main + sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))// - + ' for using immutable write buf ' - call pass_fail(nerrs, msg) + + ' - using immutable write buf ' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/record.c b/test/testcases/record.c index 170e1bfe39..e756f1240d 100644 --- a/test/testcases/record.c +++ b/test/testcases/record.c @@ -35,15 +35,17 @@ #include static -int test_only_record_var_1D(char *filename, int cmode) +int test_only_record_var_1D(const char *out_path, int format, MPI_Info info) { int ncid, varid, dimid, buf[20], err, nerrs=0; MPI_Offset start[1], count[1], length; - MPI_Info info=MPI_INFO_NULL; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_SELF, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_SELF, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimension and variable */ err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimid); CHECK_ERR @@ -67,7 +69,10 @@ int test_only_record_var_1D(char *filename, int cmode) nerrs++; } - 
if (nerrs == 0 && !(cmode & NC_NETCDF4)) { /* test independent data mode */ + /* test independent data mode */ + if (nerrs == 0 && format != NC_FORMAT_NETCDF4 && + format != NC_FORMAT_NETCDF4_CLASSIC) { + err = ncmpi_begin_indep_data(ncid); CHECK_ERR /* write the 4th record */ buf[0] = 93; @@ -92,15 +97,17 @@ int test_only_record_var_1D(char *filename, int cmode) } static -int test_only_record_var_3D(char *filename, int cmode) +int test_only_record_var_3D(const char *out_path, int format, MPI_Info info) { int i, ncid, varid, dimid[3], buf[20], err, nerrs=0; MPI_Offset start[3], count[3], length; - MPI_Info info=MPI_INFO_NULL; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_SELF, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_SELF, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimension and variable */ err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -128,7 +135,10 @@ int test_only_record_var_3D(char *filename, int cmode) nerrs++; } - if (nerrs == 0 && !(cmode & NC_NETCDF4)) { /* test independent data mode */ + /* test independent data mode */ + if (nerrs == 0 && format != NC_FORMAT_NETCDF4 && + format != NC_FORMAT_NETCDF4_CLASSIC) { + err = ncmpi_begin_indep_data(ncid); CHECK_ERR /* write the 4th record */ for (i=0; i<20; i++) buf[i] = 93; @@ -153,15 +163,17 @@ int test_only_record_var_3D(char *filename, int cmode) } static -int test_two_record_var(char *filename, int cmode) +int test_two_record_var(const char *out_path, int format, MPI_Info info) { int i, ncid, varid[2], dimid[3], buf[20], err, nerrs=0; MPI_Offset start[3], count[3], length; - MPI_Info info=MPI_INFO_NULL; + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ----------------------------------------*/ - cmode 
|= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_SELF, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_SELF, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimension and variable */ err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -190,14 +202,14 @@ int test_two_record_var(char *filename, int cmode) if (nerrs == 0) { /* test independent data mode */ /* writing new records, HDF5 requires collective I/O */ - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_begin_indep_data(ncid); CHECK_ERR } /* write the 4th record */ buf[0] = 93; start[0] = 3; count[0] = 1; - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_put_vara_int(ncid, varid[0], start, count, buf); CHECK_ERR } else { err = ncmpi_put_vara_int_all(ncid, varid[0], start, count, buf); CHECK_ERR @@ -206,7 +218,7 @@ int test_two_record_var(char *filename, int cmode) /* write the 3rd and 4th records */ buf[0] = 92; buf[1] = 93; start[0] = 2; count[0] = 2; - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_put_vara_int(ncid, varid[0], start, count, buf); CHECK_ERR } else { err = ncmpi_put_vara_int_all(ncid, varid[0], start, count, buf); CHECK_ERR @@ -218,7 +230,7 @@ int test_two_record_var(char *filename, int cmode) __LINE__,__FILE__,length); nerrs++; } - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_end_indep_data(ncid); CHECK_ERR } } @@ -244,14 +256,14 @@ int test_two_record_var(char *filename, int cmode) if (nerrs == 0) { /* test independent data mode */ /* writing new records, HDF5 requires collective I/O */ - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_begin_indep_data(ncid); CHECK_ERR } /* write the 4th record */ 
for (i=0; i<20; i++) buf[i] = 93; start[0] = 3; - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_put_vara_int(ncid, varid[1], start, count, buf); CHECK_ERR } else { err = ncmpi_put_vara_int_all(ncid, varid[1], start, count, buf); CHECK_ERR @@ -260,7 +272,7 @@ int test_two_record_var(char *filename, int cmode) /* write the 3rd record */ for (i=0; i<20; i++) buf[i] = 92; start[0] = 2; - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_put_vara_int(ncid, varid[1], start, count, buf); CHECK_ERR } else { err = ncmpi_put_vara_int_all(ncid, varid[1], start, count, buf); CHECK_ERR @@ -272,7 +284,7 @@ int test_two_record_var(char *filename, int cmode) __LINE__,__FILE__,length); nerrs++; } - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { err = ncmpi_end_indep_data(ncid); CHECK_ERR } } @@ -280,92 +292,57 @@ int test_two_record_var(char *filename, int cmode) return nerrs; } -int main(int argc, char** argv) { - char filename[256], *hint_value; - int nerrs=0, rank, nprocs, err, bb_enabled=0; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + char val[MPI_MAX_INFO_VAL]; + int err=0, rank, flag; + + /* check whether burst buffering is enabled */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0 && + (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC)) + /* does not work for NetCDF4 files when burst-buffering is enabled */ + return 0; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + if (rank >= 1) return 0; /* this test is for running 1 process */ - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - goto fn_exit; - } - if (argc == 2) 
snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for write records in reversed order", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + err = test_only_record_var_1D(out_path, format, info); + if (err > 0) return err; + err = test_only_record_var_3D(out_path, format, info); + if (err > 0) return err; + err = test_two_record_var(out_path, format, info); + if (err > 0) return err; - if (rank >= 1) goto fn_exit; /* this test is for running 1 process */ + return err; +} - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } +int main(int argc, char **argv) { - /* CDF-1: test only one 1D record variable */ - nerrs += test_only_record_var_1D(filename, 0); - /* CDF-1: test only one 3D record variable */ - nerrs += test_only_record_var_3D(filename, 0); - /* CDF-1: test two record variables */ - nerrs += test_two_record_var(filename, 0); - - /* CDF-2: test only one 1D record variable */ - nerrs += test_only_record_var_1D(filename, NC_64BIT_OFFSET); - /* CDF-2: test only one 3D record variable */ - nerrs += test_only_record_var_3D(filename, NC_64BIT_OFFSET); - /* CDF-2: test two record variables */ - nerrs += test_two_record_var(filename, NC_64BIT_OFFSET); - - if (!bb_enabled) { -#ifdef USE_NETCDF4 - /* NETCDF4: test only one 1D record variable */ - nerrs += test_only_record_var_1D(filename, NC_NETCDF4); - /* NETCDF4: test only one 3D record variable */ - nerrs += test_only_record_var_3D(filename, NC_NETCDF4); - /* NETCDF4: test two record variables */ - nerrs += test_two_record_var(filename, NC_NETCDF4); - - /* NETCDF4_CLASSIC: test only one 1D record variable */ - nerrs += test_only_record_var_1D(filename, NC_NETCDF4|NC_CLASSIC_MODEL); - /* NETCDF4_CLASSIC: 
test only one 3D record variable */ - nerrs += test_only_record_var_3D(filename, NC_NETCDF4|NC_CLASSIC_MODEL); - /* NETCDF4_CLASSIC: test two record variables */ - nerrs += test_two_record_var(filename, NC_NETCDF4|NC_CLASSIC_MODEL); -#endif - } + int err; + loop_opts opt; - /* CDF-5: test only one 1D record variable */ - nerrs += test_only_record_var_1D(filename, NC_64BIT_DATA); - /* CDF-5: test only one 3D record variable */ - nerrs += test_only_record_var_3D(filename, NC_64BIT_DATA); - /* CDF-5: test two record variables */ - nerrs += test_two_record_var(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR && malloc_size > 0) { /* this test is for running 1 process */ - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - malloc_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + MPI_Init(&argc, &argv); -fn_exit: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "write records in reversed order", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/redef1.c b/test/testcases/redef1.c index 21774f4267..e5ec4c2fa5 100644 --- a/test/testcases/redef1.c +++ b/test/testcases/redef1.c @@ -26,10 +26,14 @@ #include -static int -tst_fmt(char 
*filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - int i, j, k, rank, ncid, err, nerrs=0; + int i, j, k, nprocs, rank, ncid, err, nerrs=0; int dim0id, dim1id, dim5id, dim9id, dim2id, dimsid[2], dims2id[2]; int varid, var3id, var4id, var2id; int *data; @@ -38,11 +42,21 @@ tst_fmt(char *filename, int cmode) MPI_Offset start[2], count[2]; MPI_Comm comm = MPI_COMM_WORLD; - MPI_Comm_rank(comm, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + +#ifdef DEBUG + if (nprocs > 1 && rank == 0) + printf("Warning: %s is designed to run on 1 process\n", + basename(__FILE__)); +#endif + + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* Test NetCDF 4 first as ncvalidator checks only classic formats */ - cmode |= NC_CLOBBER; - err = ncmpi_create(comm, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(comm, out_path, NC_CLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "dim0", len0, &dim0id); CHECK_ERR err = ncmpi_def_dim(ncid, "dim1", len1, &dim1id); CHECK_ERR @@ -63,6 +77,11 @@ tst_fmt(char *filename, int cmode) err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* put data */ start[0] = 0; start[1] = 0; @@ -75,7 +94,10 @@ tst_fmt(char *filename, int cmode) for (j=0; j 0) count[0] = count[1] = 0; - err = ncmpi_put_vara_int_all(ncid, varid, start, count, &data[0]); + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, varid, start, count, &data[0]); + else + err = ncmpi_put_vara_int(ncid, varid, start, count, &data[0]); CHECK_ERR free(data); @@ -87,7 +109,10 @@ tst_fmt(char *filename, int cmode) for (j=0; j 0) count[0] = count[1] = 0; - err = ncmpi_put_vara_int_all(ncid, var3id, start, count, &data[0]); + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, var3id, start, count, &data[0]); + else + err = 
ncmpi_put_vara_int(ncid, var3id, start, count, &data[0]); CHECK_ERR free(data); @@ -99,13 +124,21 @@ tst_fmt(char *filename, int cmode) for (j=0; j 0) count[0] = count[1] = 0; - err = ncmpi_put_vara_int_all(ncid, var4id, start, count, &data[0]); + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, var4id, start, count, &data[0]); + else + err = ncmpi_put_vara_int(ncid, var4id, start, count, &data[0]); CHECK_ERR free(data); + /* file sync before file close and re-open it */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(comm, filename, NC_WRITE, MPI_INFO_NULL, &ncid); + err = ncmpi_open(comm, out_path, NC_WRITE, MPI_INFO_NULL, &ncid); CHECK_ERR err = ncmpi_redef(ncid); CHECK_ERR @@ -119,6 +152,11 @@ tst_fmt(char *filename, int cmode) err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + start[0] = 0; start[1] = 0; count[0] = len0; @@ -131,7 +169,10 @@ tst_fmt(char *filename, int cmode) k++; } if (rank > 0) count[0] = count[1] = 0; - err = ncmpi_put_vara_double_all(ncid, var2id, start, count, &dbl_data[0]); + if (coll_io) + err = ncmpi_put_vara_double_all(ncid, var2id, start, count, &dbl_data[0]); + else + err = ncmpi_put_vara_double(ncid, var2id, start, count, &dbl_data[0]); CHECK_ERR free(dbl_data); @@ -140,69 +181,27 @@ tst_fmt(char *filename, int cmode) return nerrs; } -int main(int argc, char** argv) -{ - char filename[256], *hint_value; - int commsize, rank, err, nerrs=0, bb_enabled=0; - MPI_Comm comm = MPI_COMM_WORLD; +int main(int argc, char **argv) { - MPI_Init(&argc, &argv); - MPI_Comm_size(comm, &commsize); - MPI_Comm_rank(comm, &rank); + int err; + loop_opts opt; - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "redef2.nc"); - - if (rank == 0) { - char *cmd_str = 
(char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for entering re-define mode ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - -#ifdef DEBUG - if (commsize > 1 && rank == 0) - printf("Warning: %s is designed to run on 1 process\n",argv[0]); -#endif - - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + MPI_Init(&argc, &argv); - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "entering re-define mode", opt, test_io); MPI_Finalize(); - return (nerrs 
> 0); + + return err; } diff --git a/test/testcases/scalar.c b/test/testcases/scalar.c index 098b6f5801..893d8f5b85 100644 --- a/test/testcases/scalar.c +++ b/test/testcases/scalar.c @@ -20,19 +20,27 @@ static int -tst_fmt(char *filename, int cmode) +tst_fmt(const char *out_path, int format, int coll_io, MPI_Info info) { - int err, nerrs=0, ncid, varid, buf; + int err, ncid, varid, buf; MPI_Offset start[1], count[1], stride[1], imap[1]; - /* Test NetCDF-4 first as ncvalidator checks only classic formats */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER | cmode, - MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define a scalar variable of integer type */ err = ncmpi_def_var(ncid, "scalar_var", NC_INT, 0, NULL, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + buf = 1; start[0] = 1; count[0] = 2; @@ -40,71 +48,133 @@ tst_fmt(char *filename, int cmode) imap[0] = 2; /* put */ - err = ncmpi_put_var1_int_all(ncid, varid, NULL, &buf); CHECK_ERR - err = ncmpi_put_var1_int_all(ncid, varid, start, &buf); CHECK_ERR - - err = ncmpi_put_vara_int_all(ncid, varid, start, count, &buf); CHECK_ERR - err = ncmpi_put_vara_int_all(ncid, varid, NULL, count, &buf); CHECK_ERR - err = ncmpi_put_vara_int_all(ncid, varid, start, NULL, &buf); CHECK_ERR - err = ncmpi_put_vara_int_all(ncid, varid, NULL, NULL, &buf); CHECK_ERR - - err = ncmpi_put_vars_int_all(ncid, varid, start, count, stride, &buf); CHECK_ERR - err = ncmpi_put_vars_int_all(ncid, varid, NULL, count, stride, &buf); CHECK_ERR - err = ncmpi_put_vars_int_all(ncid, varid, start, NULL, stride, &buf); CHECK_ERR - err = ncmpi_put_vars_int_all(ncid, varid, start, count, NULL, &buf); CHECK_ERR - err = ncmpi_put_vars_int_all(ncid, varid, NULL, NULL, NULL, &buf); CHECK_ERR + if (coll_io) { + err = 
ncmpi_put_var1_int_all(ncid, varid, NULL, &buf); CHECK_ERR + err = ncmpi_put_var1_int_all(ncid, varid, start, &buf); CHECK_ERR + + err = ncmpi_put_vara_int_all(ncid, varid, start, count, &buf); CHECK_ERR + err = ncmpi_put_vara_int_all(ncid, varid, NULL, count, &buf); CHECK_ERR + err = ncmpi_put_vara_int_all(ncid, varid, start, NULL, &buf); CHECK_ERR + err = ncmpi_put_vara_int_all(ncid, varid, NULL, NULL, &buf); CHECK_ERR + + err = ncmpi_put_vars_int_all(ncid, varid, start, count, stride, &buf); CHECK_ERR + err = ncmpi_put_vars_int_all(ncid, varid, NULL, count, stride, &buf); CHECK_ERR + err = ncmpi_put_vars_int_all(ncid, varid, start, NULL, stride, &buf); CHECK_ERR + err = ncmpi_put_vars_int_all(ncid, varid, start, count, NULL, &buf); CHECK_ERR + err = ncmpi_put_vars_int_all(ncid, varid, NULL, NULL, NULL, &buf); CHECK_ERR + + err = ncmpi_put_varm_int_all(ncid, varid, start, count, stride, imap, &buf); CHECK_ERR + err = ncmpi_put_varm_int_all(ncid, varid, NULL, NULL, NULL, NULL, &buf); CHECK_ERR + } + else { + err = ncmpi_put_var1_int(ncid, varid, NULL, &buf); CHECK_ERR + err = ncmpi_put_var1_int(ncid, varid, start, &buf); CHECK_ERR + + err = ncmpi_put_vara_int(ncid, varid, start, count, &buf); CHECK_ERR + err = ncmpi_put_vara_int(ncid, varid, NULL, count, &buf); CHECK_ERR + err = ncmpi_put_vara_int(ncid, varid, start, NULL, &buf); CHECK_ERR + err = ncmpi_put_vara_int(ncid, varid, NULL, NULL, &buf); CHECK_ERR + + err = ncmpi_put_vars_int(ncid, varid, start, count, stride, &buf); CHECK_ERR + err = ncmpi_put_vars_int(ncid, varid, NULL, count, stride, &buf); CHECK_ERR + err = ncmpi_put_vars_int(ncid, varid, start, NULL, stride, &buf); CHECK_ERR + err = ncmpi_put_vars_int(ncid, varid, start, count, NULL, &buf); CHECK_ERR + err = ncmpi_put_vars_int(ncid, varid, NULL, NULL, NULL, &buf); CHECK_ERR + + err = ncmpi_put_varm_int(ncid, varid, start, count, stride, imap, &buf); CHECK_ERR + err = ncmpi_put_varm_int(ncid, varid, NULL, NULL, NULL, NULL, &buf); CHECK_ERR + } - err 
= ncmpi_put_varm_int_all(ncid, varid, start, count, stride, imap, &buf); CHECK_ERR - err = ncmpi_put_varm_int_all(ncid, varid, NULL, NULL, NULL, NULL, &buf); CHECK_ERR + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } err = ncmpi_inq_varid(ncid, "scalar_var", &varid); CHECK_ERR /* get */ - err = ncmpi_get_var1_int_all(ncid, varid, NULL, &buf); CHECK_ERR - err = ncmpi_get_var1_int_all(ncid, varid, start, &buf); CHECK_ERR - - err = ncmpi_get_vara_int_all(ncid, varid, start, count, &buf); CHECK_ERR - err = ncmpi_get_vara_int_all(ncid, varid, NULL, count, &buf); CHECK_ERR - err = ncmpi_get_vara_int_all(ncid, varid, start, NULL, &buf); CHECK_ERR - err = ncmpi_get_vara_int_all(ncid, varid, NULL, NULL, &buf); CHECK_ERR - - err = ncmpi_get_vars_int_all(ncid, varid, start, count, stride, &buf); CHECK_ERR - err = ncmpi_get_vars_int_all(ncid, varid, NULL, count, stride, &buf); CHECK_ERR - err = ncmpi_get_vars_int_all(ncid, varid, start, NULL, stride, &buf); CHECK_ERR - err = ncmpi_get_vars_int_all(ncid, varid, start, count, NULL, &buf); CHECK_ERR - err = ncmpi_get_vars_int_all(ncid, varid, NULL, NULL, NULL, &buf); CHECK_ERR - - err = ncmpi_get_varm_int_all(ncid, varid, start, count, stride, imap, &buf); CHECK_ERR - err = ncmpi_get_varm_int_all(ncid, varid, NULL, NULL, NULL, NULL, &buf); CHECK_ERR + if (coll_io) { + err = ncmpi_get_var1_int_all(ncid, varid, NULL, &buf); CHECK_ERR + err = ncmpi_get_var1_int_all(ncid, varid, start, &buf); CHECK_ERR + + err = ncmpi_get_vara_int_all(ncid, varid, start, count, &buf); CHECK_ERR + err = ncmpi_get_vara_int_all(ncid, varid, NULL, count, &buf); CHECK_ERR + err = ncmpi_get_vara_int_all(ncid, varid, start, NULL, 
&buf); CHECK_ERR + err = ncmpi_get_vara_int_all(ncid, varid, NULL, NULL, &buf); CHECK_ERR + + err = ncmpi_get_vars_int_all(ncid, varid, start, count, stride, &buf); CHECK_ERR + err = ncmpi_get_vars_int_all(ncid, varid, NULL, count, stride, &buf); CHECK_ERR + err = ncmpi_get_vars_int_all(ncid, varid, start, NULL, stride, &buf); CHECK_ERR + err = ncmpi_get_vars_int_all(ncid, varid, start, count, NULL, &buf); CHECK_ERR + err = ncmpi_get_vars_int_all(ncid, varid, NULL, NULL, NULL, &buf); CHECK_ERR + + err = ncmpi_get_varm_int_all(ncid, varid, start, count, stride, imap, &buf); CHECK_ERR + err = ncmpi_get_varm_int_all(ncid, varid, NULL, NULL, NULL, NULL, &buf); CHECK_ERR + } + else { + err = ncmpi_get_var1_int(ncid, varid, NULL, &buf); CHECK_ERR + err = ncmpi_get_var1_int(ncid, varid, start, &buf); CHECK_ERR + + err = ncmpi_get_vara_int(ncid, varid, start, count, &buf); CHECK_ERR + err = ncmpi_get_vara_int(ncid, varid, NULL, count, &buf); CHECK_ERR + err = ncmpi_get_vara_int(ncid, varid, start, NULL, &buf); CHECK_ERR + err = ncmpi_get_vara_int(ncid, varid, NULL, NULL, &buf); CHECK_ERR + + err = ncmpi_get_vars_int(ncid, varid, start, count, stride, &buf); CHECK_ERR + err = ncmpi_get_vars_int(ncid, varid, NULL, count, stride, &buf); CHECK_ERR + err = ncmpi_get_vars_int(ncid, varid, start, NULL, stride, &buf); CHECK_ERR + err = ncmpi_get_vars_int(ncid, varid, start, count, NULL, &buf); CHECK_ERR + err = ncmpi_get_vars_int(ncid, varid, NULL, NULL, NULL, &buf); CHECK_ERR + + err = ncmpi_get_varm_int(ncid, varid, start, count, stride, imap, &buf); CHECK_ERR + err = ncmpi_get_varm_int(ncid, varid, NULL, NULL, NULL, NULL, &buf); CHECK_ERR + } err = ncmpi_close(ncid); CHECK_ERR - return nerrs; + + return 0; } #define WAIT_CHECK { \ CHECK_ERR \ - err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); \ + if (coll_io) \ + err = ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL); \ + else \ + err = ncmpi_wait(ncid, NC_REQ_ALL, NULL, NULL); \ CHECK_ERR \ } static int -tst_fmt_nb(char 
*filename, int cmode) +tst_fmt_nb(const char *out_path, int format, int coll_io, MPI_Info info) { - int err, nerrs=0, ncid, varid, buf; + int err, ncid, varid, buf; MPI_Offset start[1], count[1], stride[1], imap[1]; - /* No test NetCDF-4 as nonblocking APIs are not defined in NetCDF-4 */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER | cmode, - MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define a scalar variable of integer type */ err = ncmpi_def_var(ncid, "scalar_var", NC_INT, 0, NULL, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + buf = 1; start[0] = 1; count[0] = 2; @@ -129,9 +199,19 @@ tst_fmt_nb(char *filename, int cmode) err = ncmpi_iput_varm_int(ncid, varid, start, count, stride, imap, &buf, NULL); WAIT_CHECK err = ncmpi_iput_varm_int(ncid, varid, NULL, NULL, NULL, NULL, &buf, NULL); WAIT_CHECK + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } err = ncmpi_inq_varid(ncid, "scalar_var", &varid); CHECK_ERR @@ -154,79 +234,66 @@ tst_fmt_nb(char *filename, int cmode) err = ncmpi_iget_varm_int(ncid, varid, NULL, NULL, NULL, NULL, &buf, NULL); WAIT_CHECK err = ncmpi_close(ncid); CHECK_ERR - return nerrs; + + return 0; } -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256], *hint_value; 
- int err, nerrs=0, rank, nprocs, bb_enabled=0; + char val[MPI_MAX_INFO_VAL]; + int nerrs=0, flag; - MPI_Init(&argc, &argv); +#ifdef DEBUG + int rank, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for get/put scalar variables ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - -#ifdef DEBUG if (nprocs > 1 && rank == 0) printf("Warning: %s is designed to run on 1 process\n", argv[0]); #endif /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0 && + (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC)) + /* does not work for NetCDF4 files when burst-buffering is enabled */ + return 0; + + /* test blocking APIs */ + nerrs += tst_fmt(out_path, format, coll_io, info); /* test nonblocking APIs */ - nerrs += tst_fmt_nb(filename, 0); - nerrs += tst_fmt_nb(filename, NC_64BIT_OFFSET); - nerrs += tst_fmt_nb(filename, NC_64BIT_DATA); + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) + nerrs += tst_fmt_nb(out_path, format, coll_io, info); - /* test blocking APIs */ - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - 
MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "get/put scalar variables", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/seq_runs.sh b/test/testcases/seq_runs.sh index 270536cc6b..6b61670c9c 100755 --- a/test/testcases/seq_runs.sh +++ b/test/testcases/seq_runs.sh @@ -1,80 +1,53 @@ -#!/bin/sh +#!/bin/bash # -# Copyright (C) 2003, Northwestern University and Argonne National Laboratory +# Copyright (C) 2026, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. # # Exit immediately if a command exits with a non-zero status. 
set -e -VALIDATOR=../../src/utils/ncvalidator/ncvalidator +DRY_RUN=no +VERBOSE=no + +run_cmd() { + local lineno=${BASH_LINENO[$((${#BASH_LINENO[@]} - 2))]} + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line $lineno CMD: $TESTSEQRUN $@" + fi + if test "x$DRY_RUN" = xno ; then + $TESTSEQRUN $@ + fi +} + +if test "x${PNETCDF_DEBUG}" = x1 ; then + safe_modes="0 1" +else + safe_modes="0" +fi + +exe_name=`basename $1` # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -${TESTSEQRUN} ./tst_version - -${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.nc -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc1 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc2 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc5 - -${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.nc -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc1 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc2 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc5 - -NCMPIGEN=../../src/utils/ncmpigen/ncmpigen -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff - -# remove the file system type prefix name if there is any. 
-OUT_PATH=`echo "$TESTOUTDIR" | cut -d: -f2-` - -rm -f ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc -${TESTSEQRUN} ${NCMPIGEN} -v 5 -o ${TESTOUTDIR}/redef1.nc ${srcdir}/redef-good.ncdump -${TESTSEQRUN} ./redef1 ${TESTOUTDIR}/testfile.nc -${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/testfile.nc ${TESTOUTDIR}/redef1.nc -# diff -q ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc - -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/testfile.nc -rm -f ${OUT_PATH}/redef1.nc -rm -f ${OUT_PATH}/testfile.nc - -# echo "" - -if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" +# PNCIO driver does not support vard APIs +if test "x$exe_name" = xtest_vard || + test "x$exe_name" = xtest_vard_multiple || + test "x$exe_name" = xtest_vard_rec || + test "x$exe_name" = xtest_vardf90 || + test "x$exe_name" = xtest_vardf ; then + export PNETCDF_HINTS="nc_pncio=disable" +fi - # Run using burst buffer driver - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.bb.nc - ${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.bb.nc - unset PNETCDF_HINTS +for j in ${safe_modes} ; do - # Compare - for i in 1 2 5 ; do - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/put_all_kinds.nc$i ${TESTOUTDIR}/put_all_kinds.bb.nc$i - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/iput_all_kinds.nc$i ${TESTOUTDIR}/iput_all_kinds.bb.nc$i - done -fi -rm -f ${OUT_PATH}/put_all_kinds.nc* -rm -f ${OUT_PATH}/put_all_kinds.bb.nc* -rm -f ${OUT_PATH}/iput_all_kinds.nc* -rm -f ${OUT_PATH}/iput_all_kinds.bb.nc* + export PNETCDF_SAFE_MODE=$j + if test "x$VERBOSE" = xyes || test "x$DRY_RUN" = xyes ; then + echo "Line ${LINENO}: PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE" + fi -# echo "" + run_cmd ./$1 -q -o ${TESTOUTDIR}/${exe_name}.nc -if test "${ENABLE_THREAD_SAFE}" = 1 ; then - # echo "---- testing thread safety" - for j in 0 1 ; do - export PNETCDF_SAFE_MODE=$j - # echo "---- 
set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" +done # safe_modes - ${TESTSEQRUN} ./tst_pthread ${TESTOUTDIR}/tst_pthread.nc - for i in 0 1 2 3 4 5 ; do - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_pthread.nc.$i - rm -f ${OUT_PATH}/tst_pthread.nc.$i - done - done -fi diff --git a/test/testcases/test_erange.c b/test/testcases/test_erange.c index 5d354f6711..b5bb09d369 100644 --- a/test/testcases/test_erange.c +++ b/test/testcases/test_erange.c @@ -10,7 +10,7 @@ * This program tests whether NC_ERANGE error code can be reported correctly. * Note in CDF-1 and CDF-2, a special case is made to NOT report NC_ERANGE * when the variable is of NC_BYTE type and the calling APIs are of uchar. See - * http://www.unidata.ucar.edu/software/netcdf/docs/data_type.html#type_conversion + * https://docs.unidata.ucar.edu/nug/current/md_types.html#data_type * * In CDF-5, NC_ERANGE is checked for when the external data type mismatches the * internal one. @@ -38,38 +38,51 @@ #include static -int test_cdf12(char *filename, int bb_enabled, int cmode) +int test_cdf12(const char *filename, int format, int coll_io, MPI_Info info) { - int err, nerrs=0, ncid, vid, fvid, dvid, dimid; - unsigned char uc[1]; - signed char sc[1]; - int si[1]; + char val[MPI_MAX_INFO_VAL]; + int err, nerrs=0, ncid, vid, fvid, dvid, dimid, flag, bb_enabled=0; + unsigned char uc; + signed char sc; + int si; double dbuf; - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + /* check whether burst buffering is enabled */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0) { + bb_enabled = 1; + /* does not work for NetCDF4 files when burst-buffering is enabled */ + if (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC) + return 0; + } + + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR /* for CDF-1 and CDF-2, a special case is made: there is no NC_ERANGE * error can occur converting between NC_BYTE and unsigned char. - * http://www.unidata.ucar.edu/software/netcdf/docs/data_type.html#type_conversion + * https://docs.unidata.ucar.edu/nug/current/md_types.html#data_type * In brief, NC_BYTE is signed in all signed CDF-2 APIs, and unsigned in * all unsigned APIs. In CDF-2, there is only one unsigned API, _uchar. */ - uc[0] = 255; - err = ncmpi_put_att_uchar(ncid, NC_GLOBAL, "att1", NC_BYTE, 1, uc); CHECK_ERR - uc[0] = 0; /* initialize with a number that is not 0 */ - err = ncmpi_get_att_uchar(ncid, NC_GLOBAL, "att1", uc); CHECK_ERR - if (uc[0] != 255) { - printf("Error at line %d: unexpected read value %d (expecting 255)\n",__LINE__,(int)uc[0]); - nerrs++; + uc = 255; + err = ncmpi_put_att_uchar(ncid, NC_GLOBAL, "att1", NC_BYTE, 1, &uc); CHECK_ERR + uc = 0; /* initialize with a number that is not 0 */ + err = ncmpi_get_att_uchar(ncid, NC_GLOBAL, "att1", &uc); CHECK_ERR + if (uc != 255) { + printf("Error at line %d: unexpected read value %d (expecting 255)\n",__LINE__,(int)uc); + assert(0); } - sc[0] = 3; /* initialize with a number that is not -1 or -0 */ + sc = 3; /* initialize with a number that is not -1 or -0 */ /* No NC_ERANGE as the internal and external types are considered the same */ - err = ncmpi_get_att_schar(ncid, NC_GLOBAL, "att1", sc); CHECK_ERR - if ( sc[0] != -1 /* 2-complement bit representation */ - && sc[0] != -0) { /* 1-complement bit representation */ - printf("Error at line %d: unexpected read value %d (expecting 255)\n",__LINE__,(int)uc[0]); - nerrs++; + err = ncmpi_get_att_schar(ncid, NC_GLOBAL, "att1", &sc); CHECK_ERR + if ( sc != -1 /* 2-complement bit representation */ + && sc != -0) { /* 1-complement bit representation */ + printf("Error at line %d: unexpected read value %d 
(expecting 255)\n",__LINE__,(int)sc); + assert(0); } /* expect NC_ERANGE */ @@ -82,14 +95,14 @@ int test_cdf12(char *filename, int bb_enabled, int cmode) CHECK_ERR #if defined(PNETCDF_ERANGE_FILL) && PNETCDF_ERANGE_FILL == 1 - if (! (cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { float fbuf; /* read back attf and expect NC_FILL_FLOAT */ dbuf = 0.0; err = ncmpi_get_att_double(ncid, NC_GLOBAL, "attf", &dbuf); CHECK_ERR if (dbuf != NC_FILL_FLOAT) { printf("Error at line %d: unexpected read value %f (expecting %f)\n",__LINE__,dbuf,NC_FILL_FLOAT); - nerrs++; + assert(0); } /* read back attd as float and expect NC_ERANGE */ err = ncmpi_get_att_float(ncid, NC_GLOBAL, "attd", &fbuf); EXP_ERR(NC_ERANGE) @@ -97,72 +110,143 @@ int test_cdf12(char *filename, int bb_enabled, int cmode) #endif err = ncmpi_def_dim(ncid, "x", 1, &dimid); CHECK_ERR - err = ncmpi_def_var(ncid, "var_byte", NC_BYTE, 1, &dimid, &vid); CHECK_ERR + + /* NC_BYTE is an 8-bit signed integer, i.e. C type of signed char */ + err = ncmpi_def_var(ncid, "var_byte", NC_BYTE, 1, &dimid, &vid); + CHECK_ERR err = ncmpi_def_var(ncid, "var_flt", NC_FLOAT, 1, &dimid, &fvid); CHECK_ERR err = ncmpi_def_var(ncid, "var_dbl", NC_DOUBLE, 1, &dimid, &dvid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* No NC_ERANGE should be returned for CDF-1 and 2 */ - uc[0] = 255; - err = ncmpi_put_var_uchar_all(ncid, vid, uc); CHECK_ERR - uc[0] = 3; /* initialize with a number that is not -1 or -0 */ - err = ncmpi_get_var_uchar_all(ncid, vid, uc); CHECK_ERR - if (uc[0] != 255) { - printf("Error at line %d: unexpected read value %d (expecting 255)\n",__LINE__,(int)uc[0]); - nerrs++; + /* range of unsigned char is from 0 to 255. PnetCDF should detect the value + * being out of range. 
But because NetCDF standard makes a special case for + * CDF-1 and CDF-2, which treats external NC_BYTE as unsigned char with no + * type conversion. + * + * https://docs.unidata.ucar.edu/nug/current/md_types.html#data_type + * The _uchar and _schar functions were introduced in netCDF-3 to eliminate + * an ambiguity, and support both signed and unsigned byte data. In + * netCDF-2, whether the external NC_BYTE type represented signed or + * unsigned values was left up to the user. In netcdf-3, we treat NC_BYTE + * as signed for the purposes of conversion to short, int, long, float, or + * double. (Of course, no conversion takes place when the internal type is + * signed char.) In the _uchar functions, we treat NC_BYTE as if it were + * unsigned. Thus, no NC_ERANGE error can occur converting between NC_BYTE + * and unsigned char. + */ + uc = 255; + if (coll_io) + err = ncmpi_put_var_uchar_all(ncid, vid, &uc); + else + err = ncmpi_put_var_uchar(ncid, vid, &uc); + CHECK_ERR + + uc = 3; /* set read buffer to a different value before read */ + if (coll_io) + err = ncmpi_get_var_uchar_all(ncid, vid, &uc); + else + err = ncmpi_get_var_uchar(ncid, vid, &uc); + CHECK_ERR + + /* In netCDF-2, whether the external NC_BYTE type represented signed or + * unsigned values was left up to the user. 
+ */ + if (format != NC_FORMAT_CLASSIC && format != NC_FORMAT_64BIT_OFFSET && uc != 255) { + printf("Error %s at %d: uc[0] expect 255 but got %d\n", + basename(__FILE__),__LINE__,(int)uc); + assert(0); } /* No NC_ERANGE should be returned for CDF-1 and 2 */ - sc[0] = -128; - err = ncmpi_put_var_schar_all(ncid, vid, sc); CHECK_ERR - sc[0] = 0; - err = ncmpi_get_var_schar_all(ncid, vid, sc); CHECK_ERR - if (sc[0] != -128) { - printf("Error at line %d: unexpected read value %d (expecting -128)\n",__LINE__,(int)sc[0]); - nerrs++; + sc = -128; + if (coll_io) + err = ncmpi_put_var_schar_all(ncid, vid, &sc); + else + err = ncmpi_put_var_schar(ncid, vid, &sc); + CHECK_ERR + + sc = 0; /* set read buffer to a different value before read */ + if (coll_io) + err = ncmpi_get_var_schar_all(ncid, vid, &sc); + else + err = ncmpi_get_var_schar(ncid, vid, &sc); + CHECK_ERR + + if (format != NC_FORMAT_CLASSIC && format != NC_FORMAT_64BIT_OFFSET && sc != -128) { + printf("Error %s at %d: sc expect -128 but got %d\n", + basename(__FILE__),__LINE__,(int)sc); + assert(0); } /* expect NC_ERANGE */ - si[0] = -129; - err = ncmpi_put_var_int_all(ncid, vid, si); + si = -129; + if (coll_io) + err = ncmpi_put_var_int_all(ncid, vid, &si); + else + err = ncmpi_put_var_int(ncid, vid, &si); if (bb_enabled) { CHECK_ERR err = ncmpi_flush(ncid); } EXP_ERR(NC_ERANGE) - if (si[0] != -129) { /* check if put buffer content is altered */ - printf("Error at line %d: put buffer content altered %d (expecting -128)\n",__LINE__,si[0]); - nerrs++; + if (si != -129) { /* check if put buffer content is altered */ + printf("Error %s at %d: si expect -129 but got %d\n", + basename(__FILE__),__LINE__,si); + assert(0); } /* expect NC_ERANGE */ - si[0] = 256; - err = ncmpi_put_var_int_all(ncid, vid, si); + si = 256; + if (coll_io) + err = ncmpi_put_var_int_all(ncid, vid, &si); + else + err = ncmpi_put_var_int(ncid, vid, &si); if (bb_enabled) { CHECK_ERR err = ncmpi_flush(ncid); } EXP_ERR(NC_ERANGE) - if (si[0] != 256) 
{ /* check if put buffer content is altered */ - printf("Error at line %d: put buffer content altered %d (expecting 256)\n",__LINE__,si[0]); - nerrs++; + if (si != 256) { /* check if put buffer content is altered */ + printf("Error %s at %d: si expect 256 but got %d\n", + basename(__FILE__),__LINE__,si); + assert(0); } /* expect no error */ - si[0] = -128; - err = ncmpi_put_var_int_all(ncid, vid, si); CHECK_ERR - si[0] = 0; - err = ncmpi_get_var_int_all(ncid, vid, si); CHECK_ERR - if (si[0] != -128) { - printf("Error at line %d: unexpected read value %d (expecting -128)\n",__LINE__,si[0]); - nerrs++; + si = -128; + if (coll_io) + err = ncmpi_put_var_int_all(ncid, vid, &si); + else + err = ncmpi_put_var_int(ncid, vid, &si); + CHECK_ERR + + si = 0; /* set read buffer to a different value before read */ + if (coll_io) + err = ncmpi_get_var_int_all(ncid, vid, &si); + else + err = ncmpi_get_var_int(ncid, vid, &si); + CHECK_ERR + + if (format != NC_FORMAT_CLASSIC && format != NC_FORMAT_64BIT_OFFSET && si != -128) { + printf("Error %s at %d: si expect -128 but got %d\n", + basename(__FILE__),__LINE__,si); + assert(0); } /* expect NC_ERANGE */ dbuf = NC_MAX_DOUBLE/2.0; - err = ncmpi_put_var_double_all(ncid, fvid, &dbuf); + if (coll_io) + err = ncmpi_put_var_double_all(ncid, fvid, &dbuf); + else + err = ncmpi_put_var_double(ncid, fvid, &dbuf); if (bb_enabled) { CHECK_ERR err = ncmpi_flush(ncid); @@ -170,21 +254,36 @@ int test_cdf12(char *filename, int bb_enabled, int cmode) EXP_ERR(NC_ERANGE) /* write a value > NC_MAX_FLOAT */ - err = ncmpi_put_var_double_all(ncid, dvid, &dbuf); CHECK_ERR + if (coll_io) + err = ncmpi_put_var_double_all(ncid, dvid, &dbuf); + else + err = ncmpi_put_var_double(ncid, dvid, &dbuf); + CHECK_ERR + #if defined(PNETCDF_ERANGE_FILL) && PNETCDF_ERANGE_FILL == 1 - if (! 
(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { float fbuf; /* read back and expect NC_FILL_FLOAT */ - dbuf = 0.0; - err = ncmpi_get_var_double_all(ncid, fvid, &dbuf); CHECK_ERR + dbuf = 0.0; /* set read buffer to a different value before read */ + if (coll_io) + err = ncmpi_get_var_double_all(ncid, fvid, &dbuf); + else + err = ncmpi_get_var_double(ncid, fvid, &dbuf); + CHECK_ERR if (dbuf != NC_FILL_FLOAT) { - printf("Error at line %d: unexpected read value %f (expecting %f)\n",__LINE__,dbuf,NC_FILL_FLOAT); - nerrs++; + printf("Error %s at %d: dbuf expect NC_FILL_FLOAT (%f) but got %f\n", + basename(__FILE__),__LINE__,NC_FILL_FLOAT,dbuf); + assert(0); } /* read back dvid as float and expect NC_ERANGE */ - err = ncmpi_get_var_float_all(ncid, dvid, &fbuf); EXP_ERR(NC_ERANGE) + fbuf = 0.0; /* set read buffer to a different value before read */ + if (coll_io) + err = ncmpi_get_var_float_all(ncid, dvid, &fbuf); + else + err = ncmpi_get_var_float(ncid, dvid, &fbuf); + EXP_ERR(NC_ERANGE) } #endif @@ -194,123 +293,147 @@ int test_cdf12(char *filename, int bb_enabled, int cmode) } static -int test_cdf345(char *filename, int bb_enabled, int cmode) +int test_cdf345(const char *filename, int format, int coll_io, MPI_Info info) { - int err, nerrs=0, ncid, uc_vid, sc_vid, dimid; - unsigned char uc[1]; - signed char sc[1]; + char val[MPI_MAX_INFO_VAL]; + int err, nerrs=0, ncid, uc_vid, sc_vid, dimid, flag, bb_enabled=0; + unsigned char uc; + signed char sc; - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + /* check whether burst buffering is enabled */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0) { + bb_enabled = 1; + /* does not work for NetCDF4 files when burst-buffering is enabled */ + if (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC) + return 0; + } + + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR /* CDF-5 considers NC_BYTE a signed 1-byte integer and NC_UBYTE an * unsigned 1-byte integer. The special case in CDF-2 for skipping * NC_ERANGE checking for converting between NC_BYTE and unsigned * char is no longer held. */ - uc[0] = 255; - err = ncmpi_put_att_uchar(ncid, NC_GLOBAL, "att1", NC_UBYTE, 1, uc); CHECK_ERR + uc = 255; + err = ncmpi_put_att_uchar(ncid, NC_GLOBAL, "att1", NC_UBYTE, 1, &uc); CHECK_ERR /* in CDF-5, get 255 to a schar buffer should result in NC_ERANGE */ - err = ncmpi_get_att_schar(ncid, NC_GLOBAL, "att1", sc); EXP_ERR(NC_ERANGE) + err = ncmpi_get_att_schar(ncid, NC_GLOBAL, "att1", &sc); EXP_ERR(NC_ERANGE) - sc[0] = -1; /* a value should cause NC_ERANGE */ - err = ncmpi_put_att_schar(ncid, NC_GLOBAL, "att2", NC_UBYTE, 1, sc); EXP_ERR(NC_ERANGE) + sc = -1; /* a value should cause NC_ERANGE */ + err = ncmpi_put_att_schar(ncid, NC_GLOBAL, "att2", NC_UBYTE, 1, &sc); EXP_ERR(NC_ERANGE) err = ncmpi_def_dim(ncid, "x", 1, &dimid); CHECK_ERR err = ncmpi_def_var(ncid, "var_ubyte", NC_UBYTE, 1, &dimid, &uc_vid); CHECK_ERR err = ncmpi_def_var(ncid, "var_byte", NC_BYTE, 1, &dimid, &sc_vid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR - uc[0] = 255; - err = ncmpi_put_var_uchar_all(ncid, uc_vid, uc); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + + uc = 255; + if (coll_io) + err = ncmpi_put_var_uchar_all(ncid, uc_vid, &uc); + else + err = ncmpi_put_var_uchar(ncid, uc_vid, &uc); + CHECK_ERR /* in CDF-5, get 255 to an schar should result in NC_ERANGE */ - err = ncmpi_get_var_schar_all(ncid, uc_vid, sc); EXP_ERR(NC_ERANGE) + if (coll_io) + err = ncmpi_get_var_schar_all(ncid, uc_vid, &sc); + else + err = ncmpi_get_var_schar(ncid, uc_vid, &sc); + EXP_ERR(NC_ERANGE) - sc[0] = -1; /* in CDF-5, put -1 to an uchar should result in NC_ERANGE */ - err = 
ncmpi_put_var_schar_all(ncid, uc_vid, sc); + sc = -1; /* in CDF-5, put -1 to an uchar should result in NC_ERANGE */ + if (coll_io) + err = ncmpi_put_var_schar_all(ncid, uc_vid, &sc); + else + err = ncmpi_put_var_schar(ncid, uc_vid, &sc); if (bb_enabled) { CHECK_ERR err = ncmpi_flush(ncid); } EXP_ERR(NC_ERANGE) - uc[0] = 255; /* in CDF-5, put 255 to a schar should result in NC_ERANGE */ - err = ncmpi_put_var_uchar_all(ncid, sc_vid, uc); + uc = 255; /* in CDF-5, put 255 to a schar should result in NC_ERANGE */ + if (coll_io) + err = ncmpi_put_var_uchar_all(ncid, sc_vid, &uc); + else + err = ncmpi_put_var_uchar(ncid, sc_vid, &uc); if (bb_enabled) { CHECK_ERR err = ncmpi_flush(ncid); } EXP_ERR(NC_ERANGE) - sc[0] = -1; - err = ncmpi_put_var_schar_all(ncid, sc_vid, sc); CHECK_ERR - uc[0] = 0; /* in CDF-5, get -1 to an uchar should result in NC_ERANGE */ - err = ncmpi_get_var_uchar_all(ncid, sc_vid, uc); EXP_ERR(NC_ERANGE) + sc = -1; + if (coll_io) + err = ncmpi_put_var_schar_all(ncid, sc_vid, &sc); + else + err = ncmpi_put_var_schar(ncid, sc_vid, &sc); + CHECK_ERR + + uc = 0; /* in CDF-5, get -1 to an uchar should result in NC_ERANGE */ + if (coll_io) + err = ncmpi_get_var_uchar_all(ncid, sc_vid, &uc); + else + err = ncmpi_get_var_uchar(ncid, sc_vid, &uc); + EXP_ERR(NC_ERANGE) err = ncmpi_close(ncid); CHECK_ERR return nerrs; } -int main(int argc, char* argv[]) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256], *hint_value; - int err, nerrs=0, rank, bb_enabled=0; + int nerrs=0; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (format == NC_FORMAT_CLASSIC || + format == NC_FORMAT_64BIT_OFFSET || + format == NC_FORMAT_NETCDF4_CLASSIC) + nerrs += test_cdf12(out_path, format, coll_io, info); + else + nerrs += test_cdf345(out_path, format, coll_io, info); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - 
} - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for checking for NC_ERANGE ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + return nerrs; +} - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } +int main(int argc, char **argv) { - nerrs += test_cdf12(filename, bb_enabled, 0); - nerrs += test_cdf12(filename, bb_enabled, NC_64BIT_OFFSET); -#if ENABLE_NETCDF4 - if (!bb_enabled) { - nerrs += test_cdf12(filename, bb_enabled, NC_NETCDF4 | NC_CLASSIC_MODEL); - nerrs += test_cdf345(filename, bb_enabled, NC_NETCDF4); - } -#endif - nerrs += test_cdf345(filename, bb_enabled, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + int err; + loop_opts opt; - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data 
mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "NC_ERANGE", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/test_fillvalue.c b/test/testcases/test_fillvalue.c index f79b85048c..f12890d647 100644 --- a/test/testcases/test_fillvalue.c +++ b/test/testcases/test_fillvalue.c @@ -38,15 +38,30 @@ #include -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - int err, nerrs=0, ncid, varid, int_buf; + char val[MPI_MAX_INFO_VAL]; + int err, nerrs=0, flag, ncid, varid, int_buf; float flt_buf; + /* check whether burst buffering is enabled */ + MPI_Info_get(info, "nc_burst_buf", MPI_MAX_INFO_VAL - 1, val, &flag); + if (flag && strcasecmp(val, "enable") == 0 && + (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC)) + /* does not work for NetCDF4 files when burst-buffering is enabled */ + return 0; + + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a file */ - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR flt_buf = 1.234; @@ -71,61 +86,26 @@ tst_fmt(char *filename, int cmode) } int main(int argc, char **argv) { - char filename[256], *hint_value; - int err, nerrs=0, rank, bb_enabled=0; + + int err; + loop_opts opt; MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for _FillValue for NC_GLOBAL ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } - - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, 
MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "_FillValue for NC_GLOBAL", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/test_get_varn.c b/test/testcases/test_get_varn.c index 2d377f3d79..3d3a7ef15a 100644 --- a/test/testcases/test_get_varn.c +++ b/test/testcases/test_get_varn.c @@ -22,38 +22,26 @@ #define NDIMS 3 -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - int i, j, rank, nprocs, err, nerrs = 0; + int i, j, rank, err, nerrs = 0; int ncid, varid, num_reqs; double *buffer; float *fbuffer; MPI_Offset r_len, **starts = NULL, **counts = NULL; MPI_Offset st[3], ct[3]; - char filename[256]; int dimids[3]; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for get_varn ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } #ifdef DEBUG + int nprocs; + 
MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (nprocs != 4 && rank == 0) printf("Warning: %s is designed to run on 4 process\n",argv[0]); #endif @@ -69,13 +57,22 @@ int main(int argc, char** argv) * float lnfm(time, lat, lon) ; * } */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER|NC_64BIT_DATA, MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "time", NC_UNLIMITED, &dimids[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "lat", 94, &dimids[1]); CHECK_ERR err = ncmpi_def_dim(ncid, "lon", 192, &dimids[2]); CHECK_ERR err = ncmpi_def_var(ncid, "lnfm", NC_FLOAT, 3, dimids, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + st[0] = rank*2; st[1] = 0; st[2] = 0; @@ -85,15 +82,30 @@ int main(int argc, char** argv) ct[2] = 192; float *scramble = (float*) calloc(ct[0]*ct[1]*ct[2], sizeof(float)); - err = ncmpi_put_vara_float_all(ncid, varid, st, ct, scramble); CHECK_ERR + if (coll_io) + err = ncmpi_put_vara_float_all(ncid, varid, st, ct, scramble); + else + err = ncmpi_put_vara_float(ncid, varid, st, ct, scramble); + CHECK_ERR + + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_close(ncid); CHECK_ERR free(scramble); /* now we can finally exercise the read path of this record varable */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* pick 2 requests for 4 processes */ /* num_reqs = 1; => works fine*/ num_reqs = 2; @@ -136,8 +148,11 @@ int main(int argc, char** argv) /* set the buffer pointers to different offsets to the I/O buffer */ varid = 0; /* only one variable in 
lnfm.nc */ - err = ncmpi_get_varn_double_all(ncid, varid, num_reqs, starts, counts, buffer); - /* err = ncmpi_get_varn_float_all(ncid, varid, num_reqs, starts, counts, fbuffer); */ + if (coll_io) + err = ncmpi_get_varn_double_all(ncid, varid, num_reqs, starts, counts, buffer); + /* err = ncmpi_get_varn_float_all(ncid, varid, num_reqs, starts, counts, fbuffer); */ + else + err = ncmpi_get_varn_double(ncid, varid, num_reqs, starts, counts, buffer); CHECK_ERR err = ncmpi_close(ncid); @@ -160,24 +175,31 @@ int main(int argc, char** argv) free(starts); free(counts); - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "get_varn", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/test_vard.c 
b/test/testcases/test_vard.c index 5ea0ded0f5..216b59226e 100644 --- a/test/testcases/test_vard.c +++ b/test/testcases/test_vard.c @@ -64,6 +64,7 @@ if (buf[j][i] != val+i) { \ printf("line %d: expecting buf[%d][%d]=%d but got %d\n",__LINE__,j,i,val+i,buf[j][i]); \ nerrs++; \ + goto fn_exit; \ } \ } \ } \ @@ -74,12 +75,14 @@ if (buf[j][i] != rank*100+j*10+i) { \ printf("line %d: expecting buf[%d][%d]=%d but got %d\n",__LINE__,j,i,rank*100+j*10+i,(int)buf[j][i]); \ nerrs++; \ + goto fn_exit; \ } \ } \ } static int get_var_and_verify(int ncid, + int coll_io, int varid, MPI_Offset *start, MPI_Offset *count, @@ -97,7 +100,11 @@ int get_var_and_verify(int ncid, for (j=0; j------------------------------------------------------------*/ -int main(int argc, char **argv) { - - char filename[256]; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ int i, j, err, ncid, varid0, varid1, varid2, dimids[2], nerrs=0; int rank, nprocs, blocklengths[2], **buf, *bufptr; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; - int buftype_size, expected_put_size, format; - float **flt_buf, *flt_bufptr; - double **dbl_buf, *dbl_bufptr; + int buftype_size, expected_put_size, fmt; + float **flt_buf=NULL, *flt_bufptr; + double **dbl_buf=NULL, *dbl_bufptr; MPI_Offset start[2], count[2], header_size, put_size, new_put_size; MPI_Aint a0, a1, disps[2]; MPI_Datatype buftype, ghost_buftype, rec_filetype, fix_filetype; MPI_Datatype flt_buftype, dbl_buftype; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, 
"*** TESTING C %s for vard put and get ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - /* construct various MPI derived data types */ buf = (int**)malloc(sizeof(int*) * NY); @@ -226,9 +238,13 @@ int main(int argc, char **argv) { MPI_INT, &ghost_buftype); MPI_Type_commit(&ghost_buftype); + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file for write */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, - &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define a 2D array */ err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimids[0]); CHECK_ERR @@ -246,6 +262,11 @@ int main(int argc, char **argv) { err = ncmpi_fill_var_rec(ncid, varid2, 0); CHECK_ERR err = ncmpi_fill_var_rec(ncid, varid2, 1); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* create a file type for the record variable */ int *array_of_blocklengths=(int*) malloc(sizeof(int) * count[0]); MPI_Aint *array_of_displacements=(MPI_Aint*) malloc(sizeof(MPI_Aint) * count[0]); @@ -271,18 +292,22 @@ int main(int argc, char **argv) { err = ncmpi_inq_put_size(ncid, &put_size); CHECK_ERR /* write the record variable */ - err = ncmpi_put_vard_all(ncid, varid0, rec_filetype, bufptr, 1, buftype); CHECK_ERR + if (coll_io) + err = ncmpi_put_vard_all(ncid, varid0, rec_filetype, bufptr, 1, buftype); + else + err = ncmpi_put_vard(ncid, varid0, rec_filetype, bufptr, 1, buftype); + CHECK_ERR /* check if put_size is correctly reported */ err = ncmpi_inq_put_size(ncid, &new_put_size); CHECK_ERR MPI_Type_size(buftype, &buftype_size); - err = ncmpi_inq_format(ncid, &format); CHECK_ERR + err = ncmpi_inq_format(ncid, &fmt); CHECK_ERR expected_put_size = buftype_size; /* for writing a record variable, root process will update numrec to the * file header, However, because the first 2 records have been filled * above, 
root process need not write to file header. - if (rank == 0) expected_put_size += (format == NC_FORMAT_CDF5) ? 8 : 4; + if (rank == 0) expected_put_size += (fmt == NC_FORMAT_CDF5) ? 8 : 4; */ if (expected_put_size != new_put_size - put_size) { printf("Error at line %d in %s: unexpected put size ("OFFFMT") reported, expecting %d\n", @@ -299,7 +324,11 @@ int main(int argc, char **argv) { err = ncmpi_inq_put_size(ncid, &put_size); CHECK_ERR /* write the fixed-size variable */ - err = ncmpi_put_vard_all(ncid, varid1, fix_filetype, bufptr, 1, buftype); CHECK_ERR + if (coll_io) + err = ncmpi_put_vard_all(ncid, varid1, fix_filetype, bufptr, 1, buftype); + else + err = ncmpi_put_vard(ncid, varid1, fix_filetype, bufptr, 1, buftype); + CHECK_ERR /* check if put_size is correctly reported */ err = ncmpi_inq_put_size(ncid, &new_put_size); CHECK_ERR @@ -326,17 +355,27 @@ int main(int argc, char **argv) { err = ncmpi_rename_var(ncid, varid0, "rec_var"); CHECK_ERR err = ncmpi_end_indep_data(ncid); CHECK_ERR + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_close(ncid); CHECK_ERR /* open the same file and read back for validate */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, - &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_WRITE, info, &ncid); + CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } err = ncmpi_inq_varid(ncid, "rec_var", &varid0); CHECK_ERR err = ncmpi_inq_varid(ncid, "fix_var", &varid1); CHECK_ERR - nerrs += get_var_and_verify(ncid, varid0, start, count, buf, buftype, ghost_buftype, rec_filetype); - nerrs += get_var_and_verify(ncid, varid1, start, count, buf, buftype, ghost_buftype, fix_filetype); + nerrs += get_var_and_verify(ncid, coll_io, varid0, start, count, buf, buftype, ghost_buftype, rec_filetype); + nerrs += get_var_and_verify(ncid, coll_io, varid1, start, count, buf, buftype, ghost_buftype, fix_filetype); /* test 
type conversion from float to int */ flt_buf = (float**)malloc(sizeof(float*) * NY); @@ -357,13 +396,21 @@ int main(int argc, char **argv) { MPI_Type_commit(&flt_buftype); /* write the record variable with type conversion from float to int */ - err = ncmpi_put_vard_all(ncid, varid0, rec_filetype, flt_bufptr, 1, flt_buftype); CHECK_ERR + if (coll_io) + err = ncmpi_put_vard_all(ncid, varid0, rec_filetype, flt_bufptr, 1, flt_buftype); + else + err = ncmpi_put_vard(ncid, varid0, rec_filetype, flt_bufptr, 1, flt_buftype); + CHECK_ERR CHECK_VALUE(flt_buf) - nerrs += get_var_and_verify(ncid, varid0, start, count, buf, buftype, ghost_buftype, rec_filetype); + nerrs += get_var_and_verify(ncid, coll_io, varid0, start, count, buf, buftype, ghost_buftype, rec_filetype); /* read the record variable with type conversion from int to float */ for (j=0; j 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "vard APIs", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git 
a/test/testcases/test_vard_multiple.c b/test/testcases/test_vard_multiple.c index 1e47dbeb48..d2f33d9739 100644 --- a/test/testcases/test_vard_multiple.c +++ b/test/testcases/test_vard_multiple.c @@ -69,43 +69,29 @@ if ((buf)[j*NX+i] != (base)+rank*100+j*10+i) { \ printf("line %d: expecting buf[%d*NX+%d]=%d but got %d\n",\ __LINE__,j,i,(base)+rank*100+j*10+i,(buf)[j*NX+i]); \ - nerrs++; \ + assert(0); \ } \ } \ } -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) { - - char filename[256]; - int i, j, err, ncid, varid[4], dimids[3], nerrs=0, unlimit_dimid; - int rank, nprocs, *buf[2]; - int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; - int array_of_blocklengths[NY]; - MPI_Offset len, recsize, start[2], count[2], offset[2]; - MPI_Aint a0, a1, array_of_displacements[NY]; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int i, j, err, ncid, varid[4], dimids[3], nerrs=0, unlimit_dimid; + int rank, nprocs, *buf[2]; + int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; + int array_of_blocklengths[NY]; + MPI_Offset len, recsize, start[2], count[2], offset[2]; + MPI_Aint a0, a1, array_of_displacements[NY]; MPI_Datatype buftype, vtype[2], filetype; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for vard to 2 variables ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - buf[0] = (int*)malloc(sizeof(int) * NY * NX); for (j=0; j 0) - 
printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "2 var in one call", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/test_vard_rec.c b/test/testcases/test_vard_rec.c index b5c75f04a0..3227490e5d 100644 --- a/test/testcases/test_vard_rec.c +++ b/test/testcases/test_vard_rec.c @@ -12,6 +12,9 @@ * to test the fix to bug reported by Jim Edwards in r3675. * * % mpiexec -n 4 test_vard_rec + * + * When setting NX to 3, below shows the expected file contents. 
+ * * % ncmpidump /pvfs2/wkliao/testfile.nc * netcdf testfile { * // file format: CDF-1 @@ -38,43 +41,33 @@ #include #define NY 2 -#define NX 3 - -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) { - - char filename[256]; +#define NX 100 + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ int i, j, err, nerrs=0, ncid, varid, dimids[2], unlimit_dimid; int rank, nprocs, verbose, array_of_blocklengths[2], buf[NY][NX]; MPI_Offset recsize, len; MPI_Aint array_of_displacements[2]; MPI_Datatype rec_filetype; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); verbose = 0; - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for vard put on record var ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for write */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, - &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR /* define a 2D array */ err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimids[0]); CHECK_ERR @@ -82,6 +75,11 @@ int main(int argc, char **argv) { err = ncmpi_def_var(ncid, "rec_var", NC_INT, 2, dimids, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* initialize the contents of the array */ for (j=0; j 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "vard put on record var", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/test_vardf.F b/test/testcases/test_vardf.F index 44e30fa998..77f35432c2 100644 --- a/test/testcases/test_vardf.F +++ b/test/testcases/test_vardf.F @@ -64,7 +64,7 @@ subroutine 
check(err, message) if (err .NE. NF_NOERR) then write(6,*) message(1:XTRIM(message)), nfmpi_strerror(err) msg = '*** TESTING F77 test_vardf.f for vard API ' - call pass_fail(1, msg) + call pass_fail(1, msg, 0) call MPI_Abort(MPI_COMM_WORLD, -1, err) end if end ! subroutine check @@ -193,7 +193,7 @@ program main implicit none include "mpif.h" include "pnetcdf.inc" - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer err, ierr, nprocs, rank, i, j, get_args integer cmode, ncid, varid0, varid1, varid2, dimid(2), nerrs integer NX, NY, XTRIM, old_fillmode @@ -209,24 +209,31 @@ program main #else integer*8 a0, a1, disps(2) #endif + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) one = 1 two = 2 - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, + MPI_COMM_WORLD, ierr) + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, + + ierr) nerrs = 0 start(1) = NX * rank @@ -278,7 +285,7 @@ program main ! create file, truncate it if exists cmode = NF_CLOBBER - err = nfmpi_create(MPI_COMM_WORLD, filename, cmode, + err = nfmpi_create(MPI_COMM_WORLD, out_path, cmode, + MPI_INFO_NULL, ncid) call check(err, 'In nfmpi_create: ') @@ -374,7 +381,7 @@ program main call check(err, 'In nfmpi_close: ') ! 
open the same file and read back for validate */ - err = nfmpi_open(MPI_COMM_WORLD, filename, NF_NOWRITE, + err = nfmpi_open(MPI_COMM_WORLD, out_path, NF_NOWRITE, + MPI_INFO_NULL, ncid) call check(err, 'In nfmpi_open: ') @@ -413,10 +420,18 @@ program main + sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))// - + ' for vard API ' - call pass_fail(nerrs, msg) + + ' - vard API ' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/test_vardf90.f90 b/test/testcases/test_vardf90.f90 index 8a78b3fdfe..80d740f03e 100644 --- a/test/testcases/test_vardf90.f90 +++ b/test/testcases/test_vardf90.f90 @@ -53,8 +53,8 @@ subroutine check(err, message) ! It is a good idea to check returned value for possible error if (err .NE. 
NF90_NOERR) then write(6,*) trim(message), trim(nf90mpi_strerror(err)) - msg = '*** TESTING F90 test_vardf90.f90 for vard API ' - call pass_fail(1, msg) + msg = '*** TESTING F90 test_vardf90.f90 - vard API ' + call pass_fail(1, msg, 0) call MPI_Abort(MPI_COMM_WORLD, -1, err) end if end subroutine check @@ -182,7 +182,7 @@ program main use pnetcdf implicit none - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer err, ierr, nprocs, rank, i, j, get_args integer cmode, ncid, varid0, varid1, varid2, dimid(2), nerrs integer NX, NY, old_fillmode @@ -194,20 +194,27 @@ program main integer(kind=MPI_OFFSET_KIND) start(2), count(2), recno integer(kind=MPI_OFFSET_KIND) len, malloc_size, sum_size, recsize integer(kind=MPI_ADDRESS_KIND) a0, a1, disps(2) + logical keep_files + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) nerrs = 0 @@ -260,7 +267,7 @@ program main ! create file, truncate it if exists cmode = NF90_CLOBBER - err = nf90mpi_create(MPI_COMM_WORLD, filename, cmode, & + err = nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, & MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_create: ') @@ -354,7 +361,7 @@ program main call check(err, 'In nf90mpi_close: ') ! 
open the same file and read back for validate */ - err = nf90mpi_open(MPI_COMM_WORLD, filename, NF90_NOWRITE, & + err = nf90mpi_open(MPI_COMM_WORLD, out_path, NF90_NOWRITE, & MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_open: ') @@ -391,9 +398,17 @@ program main sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then - msg = '*** TESTING F90 '//trim(cmd)//' for vard API ' - call pass_fail(nerrs, msg) + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - vard API ' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/test_varm.c b/test/testcases/test_varm.c index 0248ec4000..73e20e1f56 100644 --- a/test/testcases/test_varm.c +++ b/test/testcases/test_varm.c @@ -14,6 +14,8 @@ #include +static int verbose; + static int check_read_contents(float *rh) { @@ -25,10 +27,9 @@ check_read_contents(float *rh) for (i=0; i<6; i++) { for (j=0; j<4; j++) { if (rh[j*6+i] != k) { -#ifdef PRINT_ERR_ON_SCREEN - printf("Error at %s:%d : expect rh[%d][%d]=%f but got %f\n", - __FILE__,__LINE__,j,i,k,rh[j*6+i]); -#endif + if (verbose) + printf("Error at %s:%d : expect rh[%d][%d]=%f but got %f\n", + __FILE__,__LINE__,j,i,k,rh[j*6+i]); return 1; } k += 1.0; @@ -71,11 +72,10 @@ check_write_contents(signed char *varT) for (j=0; j<4; j++) { for (i=0; i<6; i++) { if (varT[j*6+i] != j*6+i + 50) { -#ifdef PRINT_ERR_ON_SCREEN - /* this error is a pnetcdf internal error, if occurs */ - printf("Error at line %d in %s: expecting varT[%d][%d]=%d but got %d\n", - __LINE__,__FILE__,j,i,j*6+i + 50,varT[j*6+i]); -#endif + if (verbose) + /* this error is a pnetcdf internal error, if occurs */ + printf("Error at line %d in %s: expecting varT[%d][%d]=%d but got %d\n", + __LINE__,__FILE__,j,i,j*6+i + 50,varT[j*6+i]); return 1; } } @@ -83,10 
+83,14 @@ check_write_contents(signed char *varT) return 0; } -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - int i, j, err, nerrs=0, rank, nprocs; + int i, j, err, nerrs=0, rank, nprocs, debug=0; int ncid, dimid[2], varid, req, status; MPI_Offset start[2], count[2], stride[2], imap[2]; int var[6][4]; @@ -96,8 +100,16 @@ tst_fmt(char *filename, int cmode) MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + if (debug && nprocs > 1 && rank == 0) + printf("Warning: %s is designed to run on 1 process\n", + basename(__FILE__)); + + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_FATAL_ERR /* define a variable of a 6 x 4 integer array in the nc file */ err = ncmpi_def_dim(ncid, "Y", 6, &dimid[0]); CHECK_ERR @@ -105,6 +117,11 @@ tst_fmt(char *filename, int cmode) err = ncmpi_def_var(ncid, "var", NC_INT, 2, dimid, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* create a 6 x 4 integer variable in the file with contents: 0, 1, 2, 3, 4, 5, 6, 7, @@ -118,13 +135,28 @@ tst_fmt(char *filename, int cmode) start[0] = 0; start[1] = 0; count[0] = 6; count[1] = 4; if (rank > 0) count[0] = count[1] = 0; - err = ncmpi_put_vara_int_all(ncid, varid, start, count, &var[0][0]); CHECK_ERR + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, varid, start, count, &var[0][0]); + else + err = ncmpi_put_vara_int(ncid, varid, start, count, &var[0][0]); + CHECK_ERR if (nprocs > 1) MPI_Barrier(MPI_COMM_WORLD); + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_close(ncid); 
CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); + CHECK_FATAL_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } err = ncmpi_inq_varid(ncid, "var", &varid); CHECK_ERR @@ -133,16 +165,22 @@ tst_fmt(char *filename, int cmode) stride[0] = 1; stride[1] = 1; imap[0] = 1; imap[1] = 6; /* would be {4, 1} if not transposing */ - if (cmode & NC_NETCDF4) { + if (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC) { for (i=0; i<6; i++) for (j=0; j<4; j++) rh[j][i] = -1.0; - err = ncmpi_get_varm_float_all(ncid, varid, start, count, stride, imap, - &rh[0][0]); CHECK_ERR + if (coll_io) + err = ncmpi_get_varm_float_all(ncid, varid, start, count, stride, imap, &rh[0][0]); + else + err = ncmpi_get_varm_float(ncid, varid, start, count, stride, imap, &rh[0][0]); + CHECK_ERR nerrs += check_read_contents(&rh[0][0]); /* test when stride == NULL and imap != NULL */ for (i=0; i<6; i++) for (j=0; j<4; j++) rh[j][i] = -1.0; - err = ncmpi_get_varm_float_all(ncid, varid, start, count, NULL, imap, - &rh[0][0]); CHECK_ERR + if (coll_io) + err = ncmpi_get_varm_float_all(ncid, varid, start, count, NULL, imap, &rh[0][0]); + else + err = ncmpi_get_varm_float(ncid, varid, start, count, NULL, imap, &rh[0][0]); + CHECK_ERR nerrs += check_read_contents(&rh[0][0]); } else { @@ -150,7 +188,11 @@ tst_fmt(char *filename, int cmode) for (i=0; i<6; i++) for (j=0; j<4; j++) rh[j][i] = -1.0; err = ncmpi_iget_varm_float(ncid, varid, start, count, stride, imap, &rh[0][0], &req); CHECK_ERR - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, 1, &req, &status); + else + err = ncmpi_wait(ncid, 1, &req, &status); + CHECK_ERR err = status; CHECK_ERR nerrs += check_read_contents(&rh[0][0]); @@ -158,26 +200,42 @@ tst_fmt(char *filename, int cmode) for (i=0; i<6; i++) for (j=0; j<4; j++) rh[j][i] = -1.0; err 
= ncmpi_iget_varm_float(ncid, varid, start, count, NULL, imap, &rh[0][0], &req); CHECK_ERR - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, 1, &req, &status); + else + err = ncmpi_wait(ncid, 1, &req, &status); + CHECK_ERR err = status; CHECK_ERR nerrs += check_read_contents(&rh[0][0]); /* test blocking API */ for (i=0; i<6; i++) for (j=0; j<4; j++) rh[j][i] = -1.0; - err = ncmpi_get_varm_float_all(ncid, varid, start, count, stride, imap, - &rh[0][0]); CHECK_ERR + if (coll_io) + err = ncmpi_get_varm_float_all(ncid, varid, start, count, stride, imap, &rh[0][0]); + else + err = ncmpi_get_varm_float(ncid, varid, start, count, stride, imap, &rh[0][0]); + CHECK_ERR nerrs += check_read_contents(&rh[0][0]); /* test when stride == NULL and imap != NULL */ for (i=0; i<6; i++) for (j=0; j<4; j++) rh[j][i] = -1.0; - err = ncmpi_get_varm_float_all(ncid, varid, start, count, NULL, imap, - &rh[0][0]); CHECK_ERR + if (coll_io) + err = ncmpi_get_varm_float_all(ncid, varid, start, count, NULL, imap, &rh[0][0]); + else + err = ncmpi_get_varm_float(ncid, varid, start, count, NULL, imap, &rh[0][0]); + CHECK_ERR nerrs += check_read_contents(&rh[0][0]); } err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_WRITE, info, &ncid); + CHECK_FATAL_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } err = ncmpi_inq_varid(ncid, "var", &varid); CHECK_ERR @@ -186,7 +244,11 @@ tst_fmt(char *filename, int cmode) start[0] = 0; start[1] = 0; count[0] = 6; count[1] = 4; if (rank > 0) count[0] = count[1] = 0; - err = ncmpi_put_vara_int_all(ncid, varid, start, count, &var[0][0]); CHECK_ERR + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, varid, start, count, &var[0][0]); + else + err = ncmpi_put_vara_int(ncid, varid, start, count, &var[0][0]); + CHECK_ERR /* set the contents of the write buffer varT, a 4 x 6 
char array 50, 51, 52, 53, 54, 55, @@ -203,34 +265,51 @@ tst_fmt(char *filename, int cmode) imap[0] = 1; imap[1] = 6; /* would be {4, 1} if not transposing */ if (rank > 0) count[0] = count[1] = 0; - if (cmode & NC_NETCDF4) { - err = ncmpi_put_varm_schar_all(ncid, varid, start, count, stride, - imap, &varT[0][0]); CHECK_ERR + if (format == NC_FORMAT_NETCDF4 || format == NC_FORMAT_NETCDF4_CLASSIC) { + if (coll_io) + err = ncmpi_put_varm_schar_all(ncid, varid, start, count, stride, imap, &varT[0][0]); + else + err = ncmpi_put_varm_schar(ncid, varid, start, count, stride, imap, &varT[0][0]); + CHECK_ERR nerrs += check_write_contents(&varT[0][0]); } else { /* test nonblocking API */ err = ncmpi_iput_varm_schar(ncid, varid, start, count, stride, imap, &varT[0][0], &req); CHECK_ERR - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, 1, &req, &status); + else + err = ncmpi_wait(ncid, 1, &req, &status); + CHECK_ERR err = status; CHECK_ERR nerrs += check_write_contents(&varT[0][0]); /* test when stride == NULL and imap != NULL */ err = ncmpi_iput_varm_schar(ncid, varid, start, count, NULL, imap, &varT[0][0], &req); CHECK_ERR - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + if (coll_io) + err = ncmpi_wait_all(ncid, 1, &req, &status); + else + err = ncmpi_wait(ncid, 1, &req, &status); + CHECK_ERR err = status; CHECK_ERR nerrs += check_write_contents(&varT[0][0]); /* test blocking API */ - err = ncmpi_put_varm_schar_all(ncid, varid, start, count, stride, imap, - &varT[0][0]); CHECK_ERR + if (coll_io) + err = ncmpi_put_varm_schar_all(ncid, varid, start, count, stride, imap, &varT[0][0]); + else + err = ncmpi_put_varm_schar(ncid, varid, start, count, stride, imap, &varT[0][0]); + CHECK_ERR nerrs += check_write_contents(&varT[0][0]); /* test when stride == NULL and imap != NULL */ - err = ncmpi_put_varm_schar_all(ncid, varid, start, count, NULL, imap, - &varT[0][0]); CHECK_ERR + if (coll_io) + err = 
ncmpi_put_varm_schar_all(ncid, varid, start, count, NULL, imap, &varT[0][0]); + else + err = ncmpi_put_varm_schar(ncid, varid, start, count, NULL, imap, &varT[0][0]); + CHECK_ERR nerrs += check_write_contents(&varT[0][0]); } @@ -240,69 +319,27 @@ tst_fmt(char *filename, int cmode) } /*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) -{ - char filename[256], *hint_value; - int err, nerrs=0, rank, nprocs, bb_enabled=0; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for get/put varm ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } +int main(int argc, char **argv) { -#ifdef DEBUG - if (nprocs > 1 && rank == 0) - printf("Warning: %s is designed to run on 1 process\n", argv[0]); -#endif + int err; + loop_opts opt; - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + MPI_Init(&argc, &argv); - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory 
allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "get/put varm", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/tst_chunk_nonblocking.c b/test/testcases/tst_chunk_nonblocking.c new file mode 100644 index 0000000000..6cf7ce866d --- /dev/null +++ b/test/testcases/tst_chunk_nonblocking.c @@ -0,0 +1,107 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * This program tests chunking feature when using nonblocking APIs and one of + * the processes makes no call to the API. + * + * Contributed by Danqing Wu. 
+ * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +#include +#include +#include /* basename() */ + +#include +#include +#include + + +#define DIM_LEN 8 + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int err, nerrs=0, ncid, dimid, varid, rank, req, verbose=0; + int vals[DIM_LEN] = {-1, -2, -3, -4, -5, -6, -7, -8}; + MPI_Offset start, count; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + /* enable chunking */ + MPI_Info_set(info, "nc_chunking", "enable"); + + /* chunking is supported only when MPI-IO driver is used */ + MPI_Info_set(info, "nc_pncio", "disable"); + + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR + + err = ncmpi_def_dim(ncid, "x", DIM_LEN, &dimid); + CHECK_ERR + err = ncmpi_def_var(ncid, "var", NC_INT, 1, &dimid, &varid); + CHECK_ERR + + err = ncmpi_enddef(ncid); + CHECK_ERR + + if (rank == 0) + { + start = 0; + count = DIM_LEN; + err = ncmpi_iput_vara_int(ncid, varid, &start, &count, vals, &req); + CHECK_ERR + } + else + req = NC_REQ_NULL; + + if (verbose) printf("rank = %d, before ncmpi_wait_all\n", rank); + err = ncmpi_wait_all(ncid, 1, &req, NULL); + CHECK_ERR + if (verbose) printf("rank = %d, after ncmpi_wait_all\n", rank); + + err = ncmpi_close(ncid); + + return nerrs; +} + +int main(int argc, char **argv) +{ + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data 
mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "chunking using nonblocking APIs", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/tst_data_move.c b/test/testcases/tst_data_move.c new file mode 100644 index 0000000000..1dff95d2d2 --- /dev/null +++ b/test/testcases/tst_data_move.c @@ -0,0 +1,629 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +/* + * This program checks whether the data movement subroutines are called in the + * following scenario. + * 1. There is a sufficiently large free space in the header extent and fix-sized + * variable section, such that all successive adding new variables will not + * change the starting offsets of both fix-sized and record variable sections. + * 2. Add new fix-sized variables should not call the data movement subroutines. + * 3. When there is only one record, i.e. unlimited dimension size == 1, adding + * a new record variable should not call the data movement subroutines. + * 4. When there are two records, i.e. unlimited dimension size == 2, adding a + * new a new record variable SHOULD call the data movement subroutines, but + * only to move the 2nd record to a higher offset, and by moving the entire + * record, not individual record variables separately. 
+ */ + +#include +#include +#include +#include /* strcasecmp() */ +#include /* basename() */ +#include +#include + +#include + +static int debug; + +#define LON 100 +#define LAT 100 +#define NVARS 10 + +#define PRINT_VAR_OFF \ + for (i=0; i 0 && old_var_off[i] != new_var_off[i]) { \ + printf("Error: %s var %d old offset %6lld != new offset %6lld\n", \ + kind, i, old_var_off[i], new_var_off[i]); \ + nerrs++; \ + goto err_out; \ + } \ + if (debug && rank == 0) \ + printf("nvars=%d: %s var %d old offset %6lld new offset %6lld\n", \ + nvars, kind, i, old_var_off[i], new_var_off[i]); \ + old_var_off[i] = new_var_off[i]; \ + } + + +static +int read_back_check(int ncid, + int coll_io, + MPI_Offset *start, + MPI_Offset *count) +{ + char name[64]; + int i, j, k, err, nerrs=0, rank, nvars, ndims, tdim, *int_buf; + float *flt_buf; + MPI_Offset nelems, nrecords; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + nelems = count[1] * count[2]; + int_buf = (int*) malloc(sizeof(int) * nelems); + flt_buf = (float*) malloc(sizeof(float) * nelems); + + err = ncmpi_inq_nvars(ncid, &nvars); + CHECK_ERR + + err = ncmpi_inq_dimid(ncid, "time", &tdim); + CHECK_ERR + + err = ncmpi_inq_dimlen(ncid, tdim, &nrecords); + CHECK_ERR + + if (debug && rank == 0) printf("number of records=%lld\n",nrecords); + + for (i=0; i 0) { + printf("Error at rank %d line %d: read_back_check() failed\n",rank, __LINE__); + nerrs++; + goto err_out; + } + + fix_vars_size = 0; + for (i=0; i 0) { + printf("Error at rank %d line %d: read_back_check() failed\n",rank, __LINE__); + nerrs++; + goto err_out; + } + + if (debug && rank == 0) { + MPI_Offset fix_off, rec_off; + err = ncmpi_inq_varoffset(ncid, varid_1st_fix, &fix_off); + CHECK_ERR + printf("Line %d: fix_nvars=%d rec_nvars=%d \n", __LINE__,fix_nvars,rec_nvars); + printf("Line %d: first fix-sized var offset %6lld\n", __LINE__,fix_off); + err = ncmpi_inq_varoffset(ncid, varid_1st_rec, &rec_off); + CHECK_ERR + printf("Line %d: sum of all fix-sized vars %6lld\n", 
__LINE__,fix_vars_size); + printf("Line %d: fix-sized var free space %6lld\n", __LINE__, + rec_off - (fix_off + fix_vars_size)); + printf("Line %d: 1st record offset %6lld\n", __LINE__,rec_off); + } + + /* enter define mode and add a new record variable */ + err = ncmpi_redef(ncid); CHECK_ERR + + if (debug && rank == 0) { + printf("\n===================================================\n"); + printf("%s at %d: rec_nvars=%d adding a new record variable\n", + fname,__LINE__,rec_nvars); + } + + /* add a new record variable */ + err = ncmpi_def_var(ncid, "snow", NC_FLOAT, 3, dimids, &varid[5]); + CHECK_ERR + nvars++; + rec_nvars++; + + err = ncmpi_enddef(ncid); CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + + err = ncmpi_inq_header_size(ncid, &h_size); CHECK_ERR + err = ncmpi_inq_header_extent(ncid, &h_extent); CHECK_ERR + err = ncmpi_inq_recsize(ncid, &rec_size); CHECK_ERR + if (debug && rank == 0) { + printf("%s at %d: after adding a new record variable\n", + fname,__LINE__); + printf("\t\t\theader size=%lld extent=%lld rec_size=%lld\n", + h_size, h_extent,rec_size); + } + + start[0] = 0; + for (i=0; i 0) { + printf("Error at rank %d line %d: read_back_check() failed\n",rank, __LINE__); + nerrs++; + goto err_out; + } + + if (debug && rank == 0) { + MPI_Offset fix_off, rec_off; + err = ncmpi_inq_varoffset(ncid, varid_1st_fix, &fix_off); + CHECK_ERR + printf("Line %d: fix_nvars=%d rec_nvars=%d \n", __LINE__,fix_nvars,rec_nvars); + printf("Line %d: first fix-sized var offset %6lld\n", __LINE__,fix_off); + err = ncmpi_inq_varoffset(ncid, varid_1st_rec, &rec_off); + CHECK_ERR + printf("Line %d: sum of all fix-sized vars %6lld\n", __LINE__,fix_vars_size); + printf("Line %d: fix-sized var free space %6lld\n", __LINE__, + rec_off - (fix_off + fix_vars_size)); + printf("Line %d: 1st record offset %6lld\n", __LINE__,rec_off); + printf("Line %d: 2nd record offset %6lld\n", __LINE__,rec_off+rec_size); + } + + /* write 2nd record */ + if 
(debug && rank == 0) + printf("\n%s at %d: increment to 2 records\n", fname,__LINE__); + + start[0] = 1; + + for (i=0; i 0) { + printf("Error at rank %d line %d: read_back_check() failed\n",rank, __LINE__); + nerrs++; + goto err_out; + } + + if (debug && rank == 0) { + MPI_Offset fix_off, rec_off; + err = ncmpi_inq_varoffset(ncid, varid_1st_fix, &fix_off); + CHECK_ERR + printf("Line %d: fix_nvars=%d rec_nvars=%d \n", __LINE__,fix_nvars,rec_nvars); + printf("Line %d: first fix-sized var offset %6lld\n", __LINE__,fix_off); + err = ncmpi_inq_varoffset(ncid, varid_1st_rec, &rec_off); + CHECK_ERR + printf("Line %d: sum of all fix-sized vars %6lld\n", __LINE__,fix_vars_size); + printf("Line %d: fix-sized var free space %6lld\n", __LINE__, + rec_off - (fix_off + fix_vars_size)); + printf("Line %d: 1st record offset %6lld\n", __LINE__,rec_off); + printf("Line %d: 2nd record offset %6lld\n", __LINE__,rec_off+rec_size); + } + +err_out: + err = ncmpi_close(ncid); CHECK_ERR + + if (int_buf != NULL) free(int_buf); + if (flt_buf != NULL) free(flt_buf); + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 5432; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "growing data section", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/tst_def_var_fill.c b/test/testcases/tst_def_var_fill.c index 88c98ec6b7..af604519b5 100644 --- 
a/test/testcases/tst_def_var_fill.c +++ b/test/testcases/tst_def_var_fill.c @@ -31,21 +31,21 @@ #define NX 5 static int -tst_fmt(char *filename, int cmode) +tst_fmt(const char *out_path, int format, int coll_io, MPI_Info info) { int i, j, rank, nprocs, err, nerrs=0; - int ncid, format, varid[2], dimid[2], expect, *buf; + int ncid, fmt, varid[2], dimid[2], expect, *buf; MPI_Offset start[2], count[2]; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - /* allocate I/O buffer */ - buf = (int*) malloc(sizeof(int) * NY*NX); + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file for writing ------------------------------------*/ - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimension */ err = ncmpi_def_dim(ncid, "Y", NY, &dimid[0]); CHECK_ERR @@ -61,6 +61,11 @@ tst_fmt(char *filename, int cmode) err = ncmpi_enddef(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + #ifdef STRONGER_CONSISTENCY err = ncmpi_sync(ncid); CHECK_ERR MPI_Barrier(MPI_COMM_WORLD); @@ -68,6 +73,8 @@ tst_fmt(char *filename, int cmode) #endif /* initialize I/O buffer */ + buf = (int*) malloc(sizeof(int) * NY*NX); + for (i=0; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for def_var_fill ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + nerrs = tst_fmt(out_path, format, coll_io, info); - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) 
bb_enabled = 1; - free(hint_value); - } + return nerrs; +} - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } +int main(int argc, char **argv) { - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + int err; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "def_var_fill", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/tst_del_attr.c b/test/testcases/tst_del_attr.c index 29c48e2d84..e70084e5d8 100644 --- a/test/testcases/tst_del_attr.c +++ b/test/testcases/tst_del_attr.c @@ -39,8 +39,12 @@ } \ } -static int -tst_fmt(char *filename, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + 
MPI_Info info) { char *attr_name[12] = {"attr_NC_NAT", "attr_NC_BYTE", @@ -60,17 +64,22 @@ tst_fmt(char *filename, int cmode) MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (cmode == 0 || cmode == NC_64BIT_OFFSET || cmode & NC_CLASSIC_MODEL) + if (format == NC_FORMAT_CLASSIC || + format == NC_FORMAT_64BIT_OFFSET || + format == NC_FORMAT_NETCDF4_CLASSIC) max_type = NC_DOUBLE; else max_type = NC_UINT64; for (i=0; i<1024; i++) buf[i]=0; + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + for (i=NC_BYTE; i<=max_type; i++) { /* create a new file (or truncate it to 0 length) */ - cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR if (i == NC_CHAR) { for (j=0; j<3; j++) buf[j]='a'+j; @@ -84,7 +93,7 @@ tst_fmt(char *filename, int cmode) err = ncmpi_close(ncid); CHECK_ERR /* reopen the file */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_WRITE, info, &ncid); CHECK_ERR err = ncmpi_redef(ncid); CHECK_ERR err = ncmpi_del_att(ncid, NC_GLOBAL, attr_name[i]); @@ -93,7 +102,7 @@ tst_fmt(char *filename, int cmode) /* call enddef to recalculate the header size */ err = ncmpi_enddef(ncid); CHECK_ERR - if (!(cmode & NC_NETCDF4)) { + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC) { /* obtained updated header size */ err = ncmpi_inq_header_size(ncid, &header_size); CHECK_ERR } @@ -111,7 +120,7 @@ tst_fmt(char *filename, int cmode) off_t file_size; /* remove file type prefix substring */ - char *fname = remove_file_system_type_prefix(filename); + char *fname = remove_file_system_type_prefix(out_path); int fd = open(fname, O_RDONLY, 0666); @@ -123,75 +132,39 @@ tst_fmt(char *filename, int cmode) /* obtain file size */ file_size = lseek(fd, 0, SEEK_END); - if (!(cmode & NC_NETCDF4) && file_size != 
header_size) + if (format != NC_FORMAT_NETCDF4 && format != NC_FORMAT_NETCDF4_CLASSIC && + file_size != header_size) printf("Warning: expected file size "OFFFMT" but got %lld\n", header_size, (long long)file_size); close(fd); } } + return nerrs; } -int main(int argc, char* argv[]) -{ - char filename[256], *hint_value; - int err, nerrs=0, rank, bb_enabled=0; +int main(int argc, char **argv) { - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for testing delete attr ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + int err; + loop_opts opt; - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); - } + MPI_Init(&argc, &argv); - nerrs += tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4 | NC_CLASSIC_MODEL); -#endif - } - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + 
opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "delete attr", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + return err; } - diff --git a/test/testcases/tst_dimsizes.c b/test/testcases/tst_dimsizes.c index 441afc1a70..b04d3a7ed4 100644 --- a/test/testcases/tst_dimsizes.c +++ b/test/testcases/tst_dimsizes.c @@ -48,109 +48,94 @@ * MPI_Offset is a signed long long. */ -int -main(int argc, char **argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; - int rank, nprocs, err, nerrs=0; - int ncid, dimid; + int err, nerrs=0, fmt, ncid, dimid; MPI_Offset dimsize; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for defining max dimension sizes ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - /* Writing Max Dimension Size For NC_CLASSIC */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR - dimsize = DIMMAXCLASSIC; - err = ncmpi_def_dim(ncid, "testdim", dimsize, &dimid); CHECK_ERR + /* Writing Max Dimension Size */ + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR dimsize = -1; err = ncmpi_def_dim(ncid, "testdim1", dimsize, &dimid); EXP_ERR(NC_EDIMSIZE) - dimsize = (MPI_Offset)DIMMAXCLASSIC+1; - err = ncmpi_def_dim(ncid, "testdim1", dimsize, &dimid); EXP_ERR(NC_EDIMSIZE) - err = ncmpi_close(ncid); CHECK_ERR - /* Reading Max Dimension Size For NC_CLASSIC */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOCLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR - err = ncmpi_inq_dimid(ncid, "testdim", &dimid); CHECK_ERR - err = ncmpi_inq_dimlen(ncid, dimid, &dimsize); CHECK_ERR - if (dimsize != DIMMAXCLASSIC) { - printf("Error at line %d in %s: expecting dimsize %d but got "OFFFMT"\n", __LINE__,__FILE__,DIMMAXCLASSIC,dimsize); - nerrs++; + if (format == NC_FORMAT_CLASSIC) { + dimsize = DIMMAXCLASSIC; + err = ncmpi_def_dim(ncid, "testdim", dimsize, &dimid); CHECK_ERR + dimsize = (MPI_Offset)DIMMAXCLASSIC+1; + err = ncmpi_def_dim(ncid, "testdim1", dimsize, &dimid); EXP_ERR(NC_EDIMSIZE) + } + else if (format == NC_FORMAT_64BIT_OFFSET) { + dimsize = DIMMAX64OFFSET; + err = ncmpi_def_dim(ncid, "testdim", dimsize, &dimid); CHECK_ERR + 
dimsize = (MPI_Offset)DIMMAX64OFFSET+1; + err = ncmpi_def_dim(ncid, "testdim1", dimsize, &dimid); EXP_ERR(NC_EDIMSIZE) + } + else { + dimsize = DIMMAX64DATA; + err = ncmpi_def_dim(ncid, "testdim", dimsize, &dimid); CHECK_ERR + dimsize = -1; + err = ncmpi_def_dim(ncid, "testdim1", dimsize, &dimid); EXP_ERR(NC_EDIMSIZE) } - err = ncmpi_close(ncid); CHECK_ERR - /* Writing Max Dimension Size For NC_64BIT_OFFSET */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER | NC_64BIT_OFFSET, MPI_INFO_NULL, &ncid); CHECK_ERR - dimsize = DIMMAX64OFFSET; - err = ncmpi_def_dim(ncid, "testdim", dimsize, &dimid); CHECK_ERR - dimsize = -1; - err = ncmpi_def_dim(ncid, "testdim1", dimsize, &dimid); EXP_ERR(NC_EDIMSIZE) - dimsize = (MPI_Offset)DIMMAX64OFFSET+1; - err = ncmpi_def_dim(ncid, "testdim1", dimsize, &dimid); EXP_ERR(NC_EDIMSIZE) err = ncmpi_close(ncid); CHECK_ERR - /* Reading Max Dimension Size For NC_64BIT_OFFSET */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOCLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR + /* Reading Max Dimension Size */ + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOCLOBBER, info, &ncid); CHECK_ERR + err = ncmpi_inq_format(ncid, &fmt); CHECK_ERR err = ncmpi_inq_dimid(ncid, "testdim", &dimid); CHECK_ERR err = ncmpi_inq_dimlen(ncid, dimid, &dimsize); CHECK_ERR - if (dimsize != DIMMAX64OFFSET) { - printf("Error at line %d in %s: expecting dimsize %d but got "OFFFMT"\n", __LINE__,__FILE__,DIMMAX64OFFSET,dimsize); + if (format == NC_FORMAT_CLASSIC && dimsize != DIMMAXCLASSIC) { + printf("Error at line %d in %s: expecting dimsize %d but got "OFFFMT"\n", + __LINE__,__FILE__,DIMMAXCLASSIC,dimsize); nerrs++; } - err = ncmpi_close(ncid); CHECK_ERR - - /* Writing Max Dimension Size For NC_64BIT_DATA */ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER | NC_64BIT_DATA, MPI_INFO_NULL, &ncid); CHECK_ERR - dimsize = DIMMAX64DATA; - err = ncmpi_def_dim(ncid, "testdim", dimsize, &dimid); CHECK_ERR - dimsize = -1; - err = ncmpi_def_dim(ncid, "testdim1", 
dimsize, &dimid); EXP_ERR(NC_EDIMSIZE) - err = ncmpi_close(ncid); CHECK_ERR - - /* Reading Max Dimension Size For NC_64BIT_DATA */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOCLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR - err = ncmpi_inq_dimid(ncid, "testdim", &dimid); CHECK_ERR - err = ncmpi_inq_dimlen(ncid, dimid, &dimsize); CHECK_ERR - if (dimsize != DIMMAX64DATA) { - printf("Error at line %d in %s: expecting dimsize %lld but got "OFFFMT"\n", __LINE__,__FILE__,(long long)DIMMAX64DATA,dimsize); + else if (format == NC_FORMAT_64BIT_OFFSET && dimsize != DIMMAX64OFFSET) { + printf("Error at line %d in %s: expecting dimsize %d but got "OFFFMT"\n", + __LINE__,__FILE__,DIMMAX64OFFSET,dimsize); nerrs++; } + else if (format == NC_FORMAT_64BIT_DATA && dimsize != DIMMAX64DATA) { + printf("Error at line %d in %s: expecting dimsize %lld but got "OFFFMT"\n", + __LINE__,__FILE__,(long long)DIMMAX64DATA,dimsize); + nerrs++; + } + err = ncmpi_close(ncid); CHECK_ERR - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint 
nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "defining max dimension sizes", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/tst_free_comm.c b/test/testcases/tst_free_comm.c index 966bd9705e..1dadc4e603 100644 --- a/test/testcases/tst_free_comm.c +++ b/test/testcases/tst_free_comm.c @@ -23,110 +23,67 @@ #include -static int -tst_fmt(char *fname, int cmode) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { - int nerrs=0, err, exp_err=NC_NOERR, ncid; + int nerrs=0, err, ncid; MPI_Comm comm=MPI_COMM_NULL; - MPI_Info info=MPI_INFO_NULL; - -#ifndef ENABLE_NETCDF4 - if (cmode & NC_NETCDF4) - exp_err = NC_ENOTBUILT; -#endif /* duplicate MPI_COMM_WORLD */ MPI_Comm_dup(MPI_COMM_WORLD, &comm); - /* create MPI I/O hints */ - MPI_Info_create(&info); - - if (! 
(cmode & NC_NETCDF4)) - /* this hint may cause H5Fflush() to hang */ - MPI_Info_set(info, "romio_no_indep_rw", "true"); + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a file */ - cmode |= NC_CLOBBER; - err = ncmpi_create(comm, fname, cmode, info, &ncid); EXP_ERR(exp_err) - if (err == NC_ENOTBUILT) goto fn_exit; + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR MPI_Comm_free(&comm); comm = MPI_COMM_NULL; - MPI_Info_free(&info); info = MPI_INFO_NULL; err = ncmpi_close(ncid); CHECK_ERR - /* open the file */ - /* duplicate MPI_COMM_WORLD */ MPI_Comm_dup(MPI_COMM_WORLD, &comm); - /* create MPI I/O hints */ - MPI_Info_create(&info); - MPI_Info_set(info, "romio_no_indep_rw", "true"); - - err = ncmpi_open(comm, fname, NC_NOWRITE, info, &ncid); CHECK_ERR + /* open the file */ + err = ncmpi_open(comm, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR MPI_Comm_free(&comm); comm = MPI_COMM_NULL; - MPI_Info_free(&info); info = MPI_INFO_NULL; err = ncmpi_close(ncid); CHECK_ERR -fn_exit: if (comm != MPI_COMM_NULL) MPI_Comm_free(&comm); - if (info != MPI_INFO_NULL) MPI_Info_free(&info); + return nerrs; } -/*----< main() >------------------------------------------------------------*/ int main(int argc, char **argv) { - char filename[256]; - int err, nerrs=0, rank, nprocs; + int err; + loop_opts opt; MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for freeing MPI communicator ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - nerrs += 
tst_fmt(filename, 0); - nerrs += tst_fmt(filename, NC_64BIT_OFFSET); - nerrs += tst_fmt(filename, NC_NETCDF4); - nerrs += tst_fmt(filename, NC_NETCDF4|NC_CLASSIC_MODEL); - nerrs += tst_fmt(filename, NC_64BIT_DATA); - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "freeing MPI communicator", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/tst_grow_data.c b/test/testcases/tst_grow_data.c new file mode 100644 index 0000000000..77a5b882ac --- /dev/null +++ b/test/testcases/tst_grow_data.c @@ -0,0 +1,452 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +/* + * This program tests adding new fix-sized and record variables by re-entering + * the define mode, without growing file header. 
+ */ + +#include +#include +#include +#include /* strcasecmp() */ +#include /* basename() */ +#include +#include + +#include + +static int debug; + +#define LON 100 +#define LAT 100 +#define NVARS 10 + +#define PRINT_VAR_OFF \ + for (i=0; i 0) { + printf("Error at rank %d line %d: read_back_check() failed\n",rank, __LINE__); + nerrs++; + goto err_out; + } + + for (i=0; i 0) { + printf("Error at rank %d line %d: read_back_check() failed\n",rank, __LINE__); + nerrs++; + goto err_out; + } + + /* enter define mode and add a new record variable */ + err = ncmpi_redef(ncid); CHECK_ERR + + /* add a new record variable */ + err = ncmpi_def_var(ncid, "snow", NC_FLOAT, 3, dimids, &varid[5]); + CHECK_ERR + nvars++; + + err = ncmpi_enddef(ncid); CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + + err = ncmpi_inq_header_size(ncid, &h_size); CHECK_ERR + err = ncmpi_inq_header_extent(ncid, &h_extent); CHECK_ERR + if (debug && rank == 0) + printf("%s at %d: header size=%lld extent=%lld\n", fname,__LINE__, + h_size, h_extent); + + start[0] = 0; + for (i=0; i 0) { + printf("Error at rank %d line %d: read_back_check() failed\n",rank, __LINE__); + nerrs++; + goto err_out; + } + + err = ncmpi_close(ncid); CHECK_ERR + + for (i=0; i<2; i++) + if (int_buf[i] != NULL) free(int_buf[i]); + for (i=0; i<4; i++) + if (flt_buf[i] != NULL) free(flt_buf[i]); + +err_out: + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 4096; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff 
for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "growing data section", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/tst_grow_header.c b/test/testcases/tst_grow_header.c index 76b72343e6..ad8f750457 100644 --- a/test/testcases/tst_grow_header.c +++ b/test/testcases/tst_grow_header.c @@ -50,6 +50,7 @@ static int verbose; static int check_vars(MPI_Comm comm, int ncid, + int coll_io, MPI_Offset *start, MPI_Offset *count) { @@ -68,7 +69,10 @@ check_vars(MPI_Comm comm, err = ncmpi_inq_vardimid(ncid, id, dimids); CHECK_ERROUT if (dimids[0] == rec_dim) continue; - err = ncmpi_get_vara_int_all(ncid, id, start+1, count+1, buf); + if (coll_io) + err = ncmpi_get_vara_int_all(ncid, id, start+1, count+1, buf); + else + err = ncmpi_get_vara_int(ncid, id, start+1, count+1, buf); CHECK_ERR for (j=0; j 0) { \ printf("Error at line %d in %s: variable contents unexpected\n", \ __LINE__,__FILE__ ); \ @@ -181,19 +188,22 @@ check_vars(MPI_Comm comm, } #define WRITE_FIX_VAR(id) { \ for (i=0; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "tst_grow_header.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for header grow ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - if (verbose) printf("\n"); - } - cmode[0] = 0; - cmode[1] = NC_64BIT_OFFSET; - cmode[2] = NC_64BIT_DATA; + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i]); - if (nerrs > 0) goto main_exit; - } + MPI_Init(&argc, &argv); - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == 
NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 4096; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ -main_exit: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "header grow", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/tst_info.c b/test/testcases/tst_info.c index a3080f78ee..9d4fb01460 100644 --- a/test/testcases/tst_info.c +++ b/test/testcases/tst_info.c @@ -65,29 +65,23 @@ int check_pnetcdf_hints(int ncid) return nerrs; } -int main(int argc, char** argv) { - char filename[256], value[MPI_MAX_INFO_VAL]; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* not used */ + MPI_Info global_info) /* not used */ +{ + char value[MPI_MAX_INFO_VAL]; int ncid1, ncid2, rank, err, nerrs=0, len, flag, varid; MPI_Offset header_size, header_extent, expect; MPI_Info info, info_used; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char 
*cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for merging env info ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a new file and keep it opened to make ncmpii_mem_root not NULL */ err = ncmpi_create(MPI_COMM_WORLD, "dummy", NC_CLOBBER, MPI_INFO_NULL, &ncid1); CHECK_ERR @@ -110,7 +104,7 @@ int main(int argc, char** argv) { MPI_Info_set(info, "nc_var_align_size", "197"); /* size in bytes */ /* create another new file using a non-NULL MPI info --------------------*/ - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid2); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid2); CHECK_ERR MPI_Info_free(&info); @@ -206,7 +200,7 @@ int main(int argc, char** argv) { } /* re-open the file and get the MPI info object */ - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid1); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, MPI_INFO_NULL, &ncid1); CHECK_ERR /* retrieve MPI info object and check if all PnetCDF recognizable hints are * present */ @@ -214,24 +208,31 @@ int main(int argc, char** argv) { err = ncmpi_close(ncid1); CHECK_ERR - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = 
{NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "merging env info", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/tst_inq_header_size.c b/test/testcases/tst_inq_header_size.c new file mode 100644 index 0000000000..be90e8b0d1 --- /dev/null +++ b/test/testcases/tst_inq_header_size.c @@ -0,0 +1,259 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +/* + * This program tests a call to ncmpi_inq_header_size() when in the define + * mode, which should calculate and return the latest file header size. This + * can be useful for application users to decide how much free space to be + * preserved in the file header section, i.e. by setting argument h_minfree + * and/or v_align when calling ncmpi__enddef(). 
+ */ + +#include +#include +#include +#include /* strcasecmp() */ +#include /* basename() */ +#include +#include + +#include + +static int debug; + +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + char *str; + int err, nerrs=0, rank, ncid, dimids[2], varid, int_buf; + float flt_buf; + double *dbl_buf; + MPI_Offset old_h_size, old_h_extent, new_h_size, new_h_extent; + + debug = 0; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + /* create a file */ + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); + CHECK_ERR + + flt_buf = 1.234; + err = ncmpi_put_att(ncid, NC_GLOBAL, "_FillValue", NC_FLOAT, 1, &flt_buf); + CHECK_ERR + + err = ncmpi_def_dim(ncid, "X", 10, &dimids[0]); CHECK_ERR + CHECK_ERR + + err = ncmpi_def_var(ncid, "int_var", NC_INT, 1, dimids, &varid); + CHECK_ERR + + err = ncmpi_put_att(ncid, varid, "_FillValue", NC_FLOAT, 1, &flt_buf); + EXP_ERR(NC_EBADTYPE) + + int_buf = 5678; + err = ncmpi_put_att(ncid, varid, "_FillValue", NC_INT, 1, &int_buf); + CHECK_ERR + + err = ncmpi_def_var(ncid, "dbl_var", NC_DOUBLE, 1, dimids, &varid); + CHECK_ERR + + err = ncmpi_def_var(ncid, "short_var", NC_SHORT, 1, dimids, &varid); + CHECK_ERR + + err = ncmpi_set_fill(ncid, NC_FILL, NULL); CHECK_ERR + + err = ncmpi_inq_header_size(ncid, &old_h_size); CHECK_ERR + + err = ncmpi_inq_header_extent(ncid, &old_h_extent); CHECK_ERR + + if (debug && rank == 0) + printf("%s at %d: header size=%lld extent=%lld\n", __FILE__,__LINE__, + old_h_size, old_h_extent); + + if (old_h_extent != 0) { + printf("Error at %d: expect file extent size to be 0 but got %lld\n", + __LINE__, old_h_extent); + nerrs++; + goto err_out; + } + + err = ncmpi_enddef(ncid); CHECK_ERR + + err = ncmpi_inq_header_size(ncid, &new_h_size); CHECK_ERR + + err = ncmpi_inq_header_extent(ncid, &new_h_extent); CHECK_ERR + + if 
(debug && rank == 0) + printf("%s at %d: header size=%lld extent=%lld\n", __FILE__,__LINE__, + new_h_size, new_h_extent); + + if (new_h_size != old_h_size) { + printf("Error at %d: expect file header size %lld but got %lld\n", + __LINE__, old_h_size, new_h_size); + nerrs++; + goto err_out; + } + + if (new_h_extent <= old_h_extent) { + printf("Error at %d: expect file extent size > %lld but got %lld\n", + __LINE__, old_h_extent, new_h_extent); + nerrs++; + goto err_out; + } + + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + + err = ncmpi_close(ncid); CHECK_ERR + + old_h_size = new_h_size; + old_h_extent = new_h_extent; + + /* open the file */ + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_WRITE, info, &ncid); + CHECK_ERR + + err = ncmpi_inq_header_size(ncid, &new_h_size); CHECK_ERR + + err = ncmpi_inq_header_extent(ncid, &new_h_extent); CHECK_ERR + + if (debug && rank == 0) + printf("%s at %d: header size=%lld extent=%lld\n", __FILE__,__LINE__, + new_h_size, new_h_extent); + + if (new_h_size != old_h_size) { + printf("Error at %d: expect file header size %lld but got %lld\n", + __LINE__, old_h_size, new_h_size); + nerrs++; + goto err_out; + } + + if (new_h_extent != old_h_extent) { + printf("Error at %d: expect file extent size %lld but got %lld\n", + __LINE__, old_h_extent, new_h_extent); + nerrs++; + goto err_out; + } + + old_h_size = new_h_size; + old_h_extent = new_h_extent; + + /* enter define mode and add new a dimension and a variable */ + err = ncmpi_redef(ncid); CHECK_ERR + + str = "new global attribute of text data type"; + err = ncmpi_put_att_text(ncid, NC_GLOBAL, "global_attr", strlen(str), str); + CHECK_ERR + + err = ncmpi_def_dim(ncid, "time", NC_UNLIMITED, &dimids[0]); CHECK_ERR + CHECK_ERR + + err = ncmpi_def_dim(ncid, "Y", 10, &dimids[1]); CHECK_ERR + CHECK_ERR + + err = ncmpi_def_var(ncid, "new_int_var", NC_INT, 2, dimids, &varid); + CHECK_ERR + + dbl_buf = (double*) calloc(16, 
sizeof(double)); + err = ncmpi_put_att_double(ncid, varid, "attr", NC_DOUBLE, 16, dbl_buf); + CHECK_ERR + free(dbl_buf); + + err = ncmpi_inq_header_size(ncid, &new_h_size); CHECK_ERR + + err = ncmpi_inq_header_extent(ncid, &new_h_extent); CHECK_ERR + + if (debug && rank == 0) + printf("%s at %d: header size=%lld extent=%lld\n", __FILE__,__LINE__, + new_h_size, new_h_extent); + + if (new_h_size <= old_h_size) { + printf("Error at %d: expect file header size > %lld but got %lld\n", + __LINE__, old_h_size, new_h_size); + nerrs++; + goto err_out; + } + + if (new_h_extent != old_h_extent) { + printf("Error at %d: expect file extent size %lld but got %lld\n", + __LINE__, old_h_extent, new_h_extent); + nerrs++; + goto err_out; + } + + if (new_h_size > old_h_extent) + /* header size grows beyond the current file extent size */ + err = ncmpi__enddef(ncid, 0, 512, 0, 0); + else + err = ncmpi_enddef(ncid); + CHECK_ERR + + old_h_size = new_h_size; + old_h_extent = new_h_extent; + + err = ncmpi_inq_header_size(ncid, &new_h_size); CHECK_ERR + + err = ncmpi_inq_header_extent(ncid, &new_h_extent); CHECK_ERR + + if (debug && rank == 0) + printf("%s at %d: header size=%lld extent=%lld\n", __FILE__,__LINE__, + new_h_size, new_h_extent); + + if (new_h_size != old_h_size) { + printf("Error at %d: expect file header size %lld but got %lld\n", + __LINE__, old_h_size, new_h_size); + nerrs++; + goto err_out; + } + + if (new_h_extent < old_h_extent) { + printf("Error at %d: expect file extent size >= %lld but got %lld\n", + __LINE__, old_h_extent, new_h_extent); + nerrs++; + goto err_out; + } + + err = ncmpi_close(ncid); CHECK_ERR + +err_out: + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO 
driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 1; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "in define mode", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/tst_max_var_dims.c b/test/testcases/tst_max_var_dims.c index a2f256db4a..9a8e17ec92 100644 --- a/test/testcases/tst_max_var_dims.c +++ b/test/testcases/tst_max_var_dims.c @@ -26,43 +26,30 @@ #include -int main(int argc, char** argv) { - char filename[256]; - int rank, nprocs, nerrs=0; - int err, ncid; #if NC_MAX_VAR_DIMS < INT_MAX - int i, varid, *dimid; -#endif - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for checking NC_MAX_VAR_DIMS ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int err, nerrs=0, ncid, varid, *dimid; + size_t i; -#if NC_MAX_VAR_DIMS < INT_MAX - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* define dimensions */ dimid = (int*) malloc(sizeof(int) * (NC_MAX_VAR_DIMS+2)); err = ncmpi_def_dim(ncid, "dim0", NC_UNLIMITED, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "dim1", 1, &dimid[1]); CHECK_ERR - for (i=2; i 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } -#else - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR - err = ncmpi_close(ncid); CHECK_ERR - if (rank == 0) printf(SKIP_STR); -#endif + return nerrs; +} + +int main(int argc, char **argv) { + + int err=0; + loop_opts opt; + + MPI_Init(&argc, &argv); + + + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "checking NC_MAX_VAR_DIMS", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} +#else +int main(int argc, char **argv) { + return 0; +} +#endif diff --git a/test/testcases/tst_multi_redefine.c b/test/testcases/tst_multi_redefine.c new file mode 100644 index 0000000000..2dfe6f631f --- /dev/null +++ b/test/testcases/tst_multi_redefine.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in 
top-level directory. + * + */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * + * This program tests entering define modes multiple times, growing header + * extension causing moving data section to plances of higher offsets, and + * checking the contents of all variables defined. + * + * The compile and run commands are given below. + * + * % mpicc -g -o tst_multi_redefine tst_multi_redefine.c -lpnetcdf + * + * % mpiexec -l -n 4 ./tst_multi_redefine testfile.nc + * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +#include +#include +#include /* strcasecmp() */ +#include /* basename() */ +#include /* getopt() */ + +#include + +#include + +#define NROUNDS 2 +#define NY 10 +#define NX 10 + +static int verbose; + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef long long longlong; +typedef unsigned long long ulonglong; + +#define END_DEF { \ + err = ncmpi_enddef(ncid); CHECK_ERROUT \ + if (!coll_io) { \ + err = ncmpi_begin_indep_data(ncid); \ + CHECK_ERR \ + } \ + err = ncmpi_inq_header_size(ncid, &new_hdr_size); CHECK_ERROUT \ + err = ncmpi_inq_header_extent(ncid, &new_hdr_ext); CHECK_ERROUT \ + if (verbose && rank == 0) { \ + printf("Add var %2d: header size grows from %4lld to %4lld\n", \ + k, old_hdr_size, new_hdr_size); \ + if (new_hdr_ext > old_hdr_ext) \ + printf("Add var %2d: header extension grows from %4lld to %4lld\n", \ + k, old_hdr_ext, new_hdr_ext); \ + } \ + old_hdr_size = new_hdr_size; \ + old_hdr_ext = new_hdr_ext; \ +} + +#define EXP_VAL(nn) ((j + nn + rank) % NC_MAX_BYTE) + +#define DEF_VAR(xtype) { \ + err = ncmpi_redef(ncid); CHECK_ERROUT \ + sprintf(str, "fix_var_%d", k); \ + err = ncmpi_def_var(ncid, str, xtype, 2, dimids+1, &fix_varids[k]); \ + CHECK_ERROUT \ + sprintf(str, "attribute of fix-sized variable %d", k); \ + err = ncmpi_put_att_text(ncid, fix_varids[k], "attr", strlen(str), str); \ + CHECK_ERROUT \ + 
sprintf(str, "rec_var_%d", k); \ + err = ncmpi_def_var(ncid, str, xtype, 3, dimids, &rec_varids[k]); \ + CHECK_ERROUT \ + sprintf(str, "attribute of record variable %d", k); \ + err = ncmpi_put_att_text(ncid, rec_varids[k], "attr", strlen(str), str); \ + CHECK_ERROUT \ + END_DEF \ +} + +#define PUT_BUF_CHAR { \ + char *buf = (char*) malloc(sizeof(char) * nelems); \ + for (j=0; j------------------------------------------------------*/ @@ -146,13 +147,14 @@ void* thread_func(void *arg) { char filename[512]; int i, id, nprocs, cmode, err=0, nerrs=0, ncid, *ret, dimid[2], varid[2]; - int *ibuf; + int *ibuf, coll_io; double *dbuf; MPI_Offset start[2], count[2]; MPI_Info info; /* make a unique file name for each thread */ id = ((thread_arg*)arg)->id; + coll_io = ((thread_arg*)arg)->coll_io; sprintf(filename, "%s.%d", ((thread_arg*)arg)->fname, id); /* allocate I/O buffers and initialize their contents */ @@ -184,22 +186,43 @@ void* thread_func(void *arg) err = ncmpi_enddef(ncid); CHECK_ERR /* now we are in data mode */ + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* write a record to the record variable */ start[0] = 0; /* first record */ start[1] = 0; count[0] = 1; count[1] = NX; - err = ncmpi_put_vara_int_all(ncid, varid[0], start, count, ibuf); CHECK_ERR + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, varid[0], start, count, ibuf); + else + err = ncmpi_put_vara_int(ncid, varid[0], start, count, ibuf); + CHECK_ERR /* write another record to the record variable */ start[0] = 2; /* third record */ start[1] = 0; count[0] = 1; count[1] = NX; - err = ncmpi_put_vara_int_all(ncid, varid[0], start, count, ibuf); CHECK_ERR + if (coll_io) + err = ncmpi_put_vara_int_all(ncid, varid[0], start, count, ibuf); + else + err = ncmpi_put_vara_int(ncid, varid[0], start, count, ibuf); + CHECK_ERR /* write to the fixed-size variable */ - err = ncmpi_put_var_double_all(ncid, varid[1], dbuf); CHECK_ERR + if (coll_io) + err = ncmpi_put_var_double_all(ncid, 
varid[1], dbuf); + else + err = ncmpi_put_var_double(ncid, varid[1], dbuf); + CHECK_ERR + + /* file sync before closing file */ + err = ncmpi_sync(ncid); + CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR @@ -219,6 +242,12 @@ void* thread_func(void *arg) sprintf(filename, "%s.%d", ((thread_arg*)arg)->fname, id); err = ncmpi_open(MPI_COMM_SELF, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + err = ncmpi_inq_varid(ncid, "ivar", &varid[0]); CHECK_ERR err = ncmpi_inq_varid(ncid, "dvar", &varid[1]); CHECK_ERR @@ -228,7 +257,11 @@ void* thread_func(void *arg) start[1] = 0; count[0] = 1; count[1] = NX; - err = ncmpi_get_vara_int_all(ncid, varid[0], start, count, ibuf); CHECK_ERR + if (coll_io) + err = ncmpi_get_vara_int_all(ncid, varid[0], start, count, ibuf); + else + err = ncmpi_get_vara_int(ncid, varid[0], start, count, ibuf); + CHECK_ERR for (i=0; i-------------------------------------------------------------*/ -int main(int argc, char **argv) { - char filename[256]; - int i, err, nerrs=0, rank, providedT; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int i, err, nerrs=0, rank, ncid; thread_arg t_arg[NTHREADS]; /* must be unique to each thread */ - -#ifdef ENABLE_THREAD_SAFE pthread_t threads[NTHREADS]; - MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &providedT); -#else - MPI_Init(&argc, &argv); -#endif MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for thread safety ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + /* initialize thread barrier */ + err = pthread_barrier_init(&barr, NULL, NTHREADS); + ERRNO_HANDLE(err) + + /* create threads, each calls thread_func() */ + for (i=0; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; + /* wait for all threads to finish */ + for (i=0; i 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + err = tst_main(argc, argv, "thread safety", opt, test_io); - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } #else - if (rank == 0) printf(SKIP_STR); + MPI_Init(&argc, &argv); #endif -err_out: MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/tst_redefine.c b/test/testcases/tst_redefine.c index 6f9330079e..8479b4639e 100644 --- a/test/testcases/tst_redefine.c +++ b/test/testcases/tst_redefine.c @@ -1,17 +1,23 @@ /* * Copyright (C) 2024, Northwestern University and Argonne National Laboratory * See COPYRIGHT notice in top-level directory. - * */ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * This program tests all alignment features available from PnetCDF: - * 1. v_align: header extent alignment (starting file offset of data section) - * 2. h_minfree: header free space - * 3. r_align: record variable section alignment - * 4. v_minfree: free space between record variable section and the end of last - * fix-sized variable. + * 1. h_minfree: free space in the header section, i.e. + * (header extent) - (header size) >= h_minfree + * 2. v_align: alignment of the beginning of the fix-size variable section, i.e. + * (header extent) % v_align == 0 + * If no fixed-size variable is defined, v_align is ignored. + * Default value of v_align is 512. + * 3. 
v_minfree: free space between the end of last fix-sized variable and the + * record variable section. + * If no fixed-size variable is defined, v_minfree is ignored. + * 4. r_align: alignment of the beginning of the record variable section. + * If no fixed-size variable is defined, default value of r_align is 512. + * Otherwise, default value of r_align is 4. * * Tests are done by reentering the define mode multiple times. * @@ -28,35 +34,47 @@ #include #include /* strcasecmp() */ #include /* basename() */ +#include + #include #include #define LEN 101 +#ifndef MAX +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#endif #define RNDUP(x, unit) ((((x) + (unit) - 1) / (unit)) * (unit)) static int verbose; -#define CHECK_VAL(ncid, varid, ii, val, expect) { \ - if (val != expect) { \ - char name[16]; \ - err = ncmpi_inq_varname(ncid, varid, name); \ - CHECK_ERROUT \ - printf("%s line %d: var %s i=%d expecting %d but got %d\n", \ - __func__,__LINE__,name,ii,expect,val); \ - nerrs++; \ - goto err_out; \ - } \ +#define CHECK_VAL(ncid, varid, ii, val, expect) { \ + if (val != expect) { \ + char name[16]; \ + int ndims; \ + err = ncmpi_inq_varndims(ncid, varid, &ndims); \ + CHECK_ERROUT \ + err = ncmpi_inq_varname(ncid, varid, name); \ + CHECK_ERROUT \ + if (ndims == 1) \ + printf("%s line %d: var %s[%d] expecting %d but got %d\n", \ + __func__,__LINE__,name,ii,expect,val); \ + else /* record variable */ \ + printf("%s line %d: var %s[%d][%d] expecting %d but got %d\n", \ + __func__,__LINE__,name,ii/LEN,ii%LEN,expect,val); \ + nerrs++; \ + goto err_out; \ + } \ } /*----< check_vars() >-------------------------------------------------------*/ /* read back variables from file and check their contents */ static int -check_vars(MPI_Comm comm, int ncid, int *varid) +check_vars(MPI_Comm comm, int ncid, int *varid, int coll_io) { - int i, nerrs=0, err, rank, *buf=NULL, nvars; - MPI_Offset start[2], count[2]; + int i, j, nerrs=0, err, rank, *buf[4], nvars, off_val; + MPI_Offset start[2], 
count[2], bufLen[4]; MPI_Comm_rank(comm, &rank); @@ -67,42 +85,86 @@ check_vars(MPI_Comm comm, int ncid, int *varid) start[1] = rank * LEN; count[1] = LEN; - buf = (int*) malloc(sizeof(int) * 2 * LEN); + buf[0] = (int*) malloc(sizeof(int) * 2 * LEN * 4); + for (i=0; i<2*LEN*4; i++) buf[0][i] = -1; /* check record variables */ count[0] = 2; - for (i=0; i 0) { \ printf("Error at line %d in %s: check_vars failed\n", \ __LINE__,__FILE__); \ @@ -182,26 +244,41 @@ check_vars(MPI_Comm comm, int ncid, int *varid) #define CHECK_ALIGNMENTS { \ /* hints set in MPI info precede ncmpi__enddef */ \ v_align = (env_v_align) ? env_v_align : \ - (!has_fix_vars && env_r_align) ? env_r_align : \ - (info_v_align) ? info_v_align : v_align; \ + (info_v_align) ? info_v_align : \ + (v_align > 0) ? v_align : 512; \ r_align = (env_r_align) ? env_r_align : \ - (info_r_align) ? info_r_align : r_align; \ + (info_r_align) ? info_r_align : \ + (r_align > 0) ? r_align : \ + (has_fix_vars) ? 4 : 512;\ + if (h_minfree == -1) h_minfree = 0; \ + if (v_minfree == -1) v_minfree = 0; \ exp_hsize = old_hsize + increment; \ - exp_extent = RNDUP(exp_hsize + h_minfree, v_align); \ - old_extent = RNDUP(old_extent, v_align); \ - exp_extent = (exp_extent < old_extent) ? old_extent : exp_extent; \ + exp_extent = exp_hsize + h_minfree; \ if (has_fix_vars) { \ - exp_r_begin = RNDUP(exp_extent + fix_v_size + v_minfree, r_align); \ - old_r_begin = RNDUP(old_r_begin, r_align); \ + exp_extent = MAX(exp_extent, old_extent); \ + exp_extent = RNDUP(exp_extent, v_align); \ + exp_r_begin = exp_extent + fix_v_size + v_minfree; \ + exp_r_begin = MAX(exp_r_begin, old_r_begin); \ + exp_r_begin = RNDUP(exp_r_begin, r_align); \ + } else { \ + exp_r_begin = MAX(exp_extent, old_r_begin); \ + exp_r_begin = RNDUP(exp_r_begin, r_align); \ + exp_extent = exp_r_begin; \ } \ - else /* v_minfree and r_align are ignored */ \ - exp_r_begin = exp_extent; \ - exp_r_begin = (exp_r_begin < old_r_begin) ? 
old_r_begin : exp_r_begin; \ - exp_h_free = exp_extent - exp_hsize; \ - exp_v_free = exp_r_begin - (exp_extent + fix_v_size); \ + exp_h_free = exp_extent - exp_hsize; \ + exp_v_free = exp_r_begin - (exp_extent + fix_v_size); \ CHECK_HEADER_SIZE \ } +#define PRINT_HINTS \ + if (verbose && rank == 0) { \ + printf("\n========================================\n"); \ + printf(" Line %d hsize %lld extent %lld r_begin %lld has_fix_vars %d\n", \ + __LINE__,hsize,extent,r_begin, has_fix_vars); \ + printf(" Line %d ncmpi__enddef() increment %lld h_minfree %lld v_align %lld v_minfree %lld r_align %lld\n", \ + __LINE__, increment, h_minfree, v_align, v_minfree, r_align); \ + } + /* test alignments hints * 1. set in environment variable PNETCDF_HINTS, * 2. set in ncmpi__enddef() @@ -209,22 +286,23 @@ check_vars(MPI_Comm comm, int ncid, int *varid) * Note precedence of hints: PNETCDF_HINTS > ncmpi__enddef() > MPI info. */ static int -tst_fmt(char *filename, - int cmode, +tst_fmt(const char *out_path, + int coll_io, + MPI_Info global_info, int has_fix_vars, MPI_Offset *env_align, /* [3] 0 means unset in PNETCDF_HINTS */ MPI_Offset *info_align) /* [3] 0 means unset in MPI info */ { int i, rank, nprocs, ncid, err, nerrs=0; - int *buf, dimid[3], varid[4]; + int *buf[4], dimid[3], varid[4]; MPI_Info info=MPI_INFO_NULL; - MPI_Offset start[2], count[2], increment, fix_v_size; + MPI_Offset bufLen[4], start[2], count[2], increment, fix_v_size; - MPI_Offset hsize=0, old_hsize, exp_hsize; - MPI_Offset extent=0, old_extent, exp_extent; + MPI_Offset hsize=0, old_hsize=-1, exp_hsize=-1; + MPI_Offset extent=0, old_extent=-1, exp_extent=-1; MPI_Offset h_free=0, old_h_free, exp_h_free; MPI_Offset v_free=0, old_v_free, exp_v_free; - MPI_Offset r_begin=0, old_r_begin, exp_r_begin; + MPI_Offset r_begin=0, old_r_begin=-1, exp_r_begin=-1; MPI_Offset h_minfree, v_align, v_minfree, r_align; MPI_Offset env_h_align=0, env_v_align=0, env_r_align=0; MPI_Offset info_h_align=0, info_v_align=0, info_r_align=0; 
@@ -234,9 +312,11 @@ tst_fmt(char *filename, MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); + MPI_Info_dup(global_info, &info); + if (verbose && rank == 0) - printf("---- cmode=%d has_fix_vars=%d env_align=%s info_align=%s\n", - cmode,has_fix_vars,(env_align==NULL)?"NULL":"SET", + printf("---- has_fix_vars=%d env_align=%s info_align=%s\n", + has_fix_vars,(env_align==NULL)?"NULL":"SET", (info_align==NULL)?"NULL":"SET"); if (env_align != NULL) { @@ -250,7 +330,6 @@ tst_fmt(char *filename, info_h_align = info_align[0]; /* 0 means unset in MPI info */ info_v_align = info_align[1]; /* 0 means unset in MPI info */ info_r_align = info_align[2]; /* 0 means unset in MPI info */ - MPI_Info_create(&info); if (info_h_align) { sprintf(str, OFFFMT, info_h_align); MPI_Info_set(info, "nc_header_align_size", str); @@ -266,14 +345,12 @@ tst_fmt(char *filename, if (info_v_align == 0) info_v_align = info_h_align; } if (verbose && rank == 0) - printf("---- cmode=%d has_fix_vars=%d env_align="OFFFMT" "OFFFMT" "OFFFMT" info_align="OFFFMT" "OFFFMT" "OFFFMT"\n", - cmode,has_fix_vars,env_h_align,env_v_align,env_r_align, + printf("---- has_fix_vars=%d env_align="OFFFMT" "OFFFMT" "OFFFMT" info_align="OFFFMT" "OFFFMT" "OFFFMT"\n", + has_fix_vars,env_h_align,env_v_align,env_r_align, info_h_align,info_v_align,info_r_align); - /* create a new file */ - cmode |= NC_CLOBBER; - err = ncmpi_create(comm, filename, cmode, info, &ncid); CHECK_ERR + err = ncmpi_create(comm, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "time", NC_UNLIMITED, &dimid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "dim", LEN*nprocs, &dimid[1]); CHECK_ERR @@ -287,53 +364,96 @@ tst_fmt(char *filename, } err = ncmpi_put_att_text(ncid, NC_GLOBAL, "attr", 0, NULL); CHECK_ERR + increment = 0; h_minfree = v_minfree = v_align = r_align = -1; + PRINT_HINTS err = ncmpi_enddef(ncid); CHECK_ERR + buf[0] = (int*) malloc(sizeof(int) * 2 * LEN * 4); + + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + 
CHECK_ERR + } + /* write to all variables, 2 records */ start[0] = 0; start[1] = rank * LEN; count[0] = 2; count[1] = LEN; - buf = (int*) malloc(sizeof(int) * count[0] * count[1]); + bufLen[0] = count[0]*count[1]; + for (i=0; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "tst_redefine.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for header alignment ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - if (verbose) printf("\n"); - } - cmode[0] = 0; - cmode[1] = NC_64BIT_OFFSET; - cmode[2] = NC_64BIT_DATA; + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* retrieve value of environment variable PNETCDF_HINTS */ + saved_env = getenv("PNETCDF_HINTS"); + if (verbose && rank == 0 && saved_env != NULL) + printf("PNETCDF_HINTS=%s\n",saved_env); + + /* No alignment hints should be set in the environment variable + * PNETCDF_HINTS (there can be other kinds) and the info object passed into + * this subroutine (there can be other hints) before running this test + * program. Check seq_runs.sh and parallel_run.sh first before running this + * test program. + */ for (has_fix_vars=1; has_fix_vars>=0; has_fix_vars--) { - /* No hints set in environment variable PNETCDF_HINTS. - * No hints set in MPI Info object. - */ - unsetenv("PNETCDF_HINTS"); - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i], has_fix_vars, NULL, NULL); - if (nerrs > 0) goto main_exit; - } + /* Test when there is no alignment hints set at all */ + nerrs += tst_fmt(out_path, coll_io, info, has_fix_vars, NULL, NULL); + if (nerrs > 0) goto err_out; + + /* Test when there is no alignment hints set in environment variable + * PNETCDF_HINTS and set alignment hints in MPI Info object. 
+ */ info_align[0] = 28; /* 7 x 4 */ info_align[1] = 44; /* 11 x 4 */ info_align[2] = 52; /* 13 x 4 */ - /* No hints set in environment variable PNETCDF_HINTS. - * Hints set in MPI Info object. - */ - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i], has_fix_vars, NULL, - info_align); - if (nerrs > 0) goto main_exit; - } + nerrs += tst_fmt(out_path, coll_io, info, has_fix_vars, NULL, info_align); + if (nerrs > 0) goto err_out; + /* Set hints in environment variable PNETCDF_HINTS, but no hints set in + * MPI Info object. + */ env_align[0] = 68; /* 17 x 4 */ env_align[1] = 76; /* 19 x 4 */ env_align[2] = 92; /* 23 x 4 */ @@ -566,23 +731,20 @@ int main(int argc, char** argv) env_align[0], env_align[1], env_align[2]); setenv("PNETCDF_HINTS", str, 1); - /* Set hints in environment variable PNETCDF_HINTS. - * No hints set in MPI Info object. - */ - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i], has_fix_vars, env_align, NULL); - if (nerrs > 0) goto main_exit; - } + nerrs += tst_fmt(out_path, coll_io, info, has_fix_vars, env_align, NULL); + if (nerrs > 0) goto err_out; - /* Set hints in environment variable PNETCDF_HINTS. - * Set hints in MPI Info object. + /* Test if the alignment hints set in environment variable + * PNETCDF_HINTS take precedence over hints set in MPI Info object, when + * Hints are both set in environment variable PNETCDF_HINTS and in MPI + * Info object. */ - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i], has_fix_vars, env_align, - info_align); - if (nerrs > 0) goto main_exit; - } + nerrs += tst_fmt(out_path, coll_io, info, has_fix_vars, env_align, info_align); + if (nerrs > 0) goto err_out; + /* Test a different set of alignment hints set in environment variable + * PNETCDF_HINTS. 
+ */ env_align[0] = 68; /* 17 x 4 */ env_align[1] = 0; env_align[2] = 92; /* 23 x 4 */ @@ -590,15 +752,12 @@ int main(int argc, char** argv) env_align[0], env_align[2]); setenv("PNETCDF_HINTS", str, 1); - /* Set hints in environment variable PNETCDF_HINTS. - * No hints set in MPI Info object. - */ - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i], has_fix_vars, env_align, - info_align); - if (nerrs > 0) goto main_exit; - } + nerrs += tst_fmt(out_path, coll_io, info, has_fix_vars, env_align, info_align); + if (nerrs > 0) goto err_out; + /* Test a different set of alignment hints set in environment variable + * PNETCDF_HINTS. + */ env_align[0] = 0; env_align[1] = 76; /* 19 x 4 */ env_align[2] = 92; /* 23 x 4 */ @@ -606,65 +765,64 @@ int main(int argc, char** argv) env_align[1], env_align[2]); setenv("PNETCDF_HINTS", str, 1); - /* Set hints in environment variable PNETCDF_HINTS. - * No hints set in MPI Info object. - */ - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i], has_fix_vars, env_align, - info_align); - if (nerrs > 0) goto main_exit; - } + nerrs += tst_fmt(out_path, coll_io, info, has_fix_vars, env_align, info_align); + if (nerrs > 0) goto err_out; + /* Test a different set of alignment hints set in environment variable + * PNETCDF_HINTS. + */ env_align[0] = 0; env_align[1] = 76; /* 19 x 4 */ env_align[2] = 0; sprintf(str, "nc_var_align_size="OFFFMT"\n", env_align[1]); setenv("PNETCDF_HINTS", str, 1); - /* Set hints in environment variable PNETCDF_HINTS. - * No hints set in MPI Info object. - */ - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i], has_fix_vars, env_align, - info_align); - if (nerrs > 0) goto main_exit; - } + nerrs += tst_fmt(out_path, coll_io, info, has_fix_vars, env_align, info_align); + if (nerrs > 0) goto err_out; + /* Test a different set of alignment hints set in environment variable + * PNETCDF_HINTS. 
+ */ env_align[0] = 0; /* 17 x 4 */ env_align[1] = 0; env_align[2] = 92; /* 23 x 4 */ sprintf(str, "nc_record_align_size="OFFFMT"\n", env_align[2]); setenv("PNETCDF_HINTS", str, 1); - /* Set hints in environment variable PNETCDF_HINTS. - * No hints set in MPI Info object. - */ - for (i=0; i<3; i++) { - nerrs += tst_fmt(filename, cmode[i], has_fix_vars, env_align, - info_align); - if (nerrs > 0) goto main_exit; - } - } + nerrs += tst_fmt(out_path, coll_io, info, has_fix_vars, env_align, info_align); + if (nerrs > 0) goto err_out; - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); + /* restore the original value set in environment variable PNETCDF_HINTS */ + if (saved_env != NULL) setenv("PNETCDF_HINTS", saved_env, 1); + else unsetenv("PNETCDF_HINTS"); } -main_exit: - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +err_out: + return nerrs; +} + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 300; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables 
*/ + + err = tst_main(argc, argv, "header alignment", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/tst_symlink.c b/test/testcases/tst_symlink.c index 828035bc25..a5b1279676 100644 --- a/test/testcases/tst_symlink.c +++ b/test/testcases/tst_symlink.c @@ -32,36 +32,23 @@ } \ } -int main(int argc, char **argv) { - char *filename, *symlink_fname, *fname; - int err, nerrs=0, len, rank, ncid, verbose=0; +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) +{ + char *symlink_fname, *fname; + int err, nerrs=0, rank, ncid, verbose=0; struct stat statbuf; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) filename = strdup(argv[1]); - else filename = strdup("testfile.nc"); - len = (int)strlen(filename) + 1; - MPI_Bcast(&len, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(filename, len, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for NC_CLOBBER on symlink file ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - /* remove file system type prefix substring */ - fname = remove_file_system_type_prefix(filename); + fname = remove_file_system_type_prefix(out_path); - symlink_fname = (char*) malloc(strlen(filename) + 10); + symlink_fname = (char*) malloc(strlen(out_path) + 10); /* create a regular file and a symbolic link to it */ err = 0; @@ -91,17 +78,18 @@ int main(int argc, char **argv) { sync(); } MPI_Bcast(&err, 1, MPI_INT, 0, MPI_COMM_WORLD); - if (err != 0) { - nerrs++; - goto fn_exit; - } + if (err != 0) return 1; MPI_Barrier(MPI_COMM_WORLD); /* symlink_fname may have file system type prefix */ - sprintf(symlink_fname, "%s.symlink", filename); + 
sprintf(symlink_fname, "%s.symlink", out_path); + + /* Set file format */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR /* create a file in NC_CLOBBER mode */ - err = ncmpi_create(MPI_COMM_WORLD, symlink_fname, NC_CLOBBER, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, symlink_fname, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_close(ncid); CHECK_ERR @@ -130,27 +118,33 @@ int main(int argc, char **argv) { sync(); } - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } - - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } - free(filename); free(symlink_fname); -fn_exit: - MPI_Finalize(); - return (nerrs > 0); + return nerrs; } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "NC_CLOBBER on symlink file", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/tst_varn_var1.c b/test/testcases/tst_varn_var1.c 
new file mode 100644 index 0000000000..50e4422e78 --- /dev/null +++ b/test/testcases/tst_varn_var1.c @@ -0,0 +1,242 @@ +/********************************************************************* + * + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + * + *********************************************************************/ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * This example tests a single call of ncmpi_put_varn_int_all() to write a + * sequence of requests with arbitrary array indices, all with length == 1. + * + * The compile and run commands are given below, together with an ncmpidump of + * the output file. + * + * % mpicc -O2 -o tst_varn_var1 tst_varn_var1.c -lpnetcdf + * % mpiexec -n 4 ./tst_varn_var1 /pvfs2/wkliao/testfile.nc + * % ncmpidump /pvfs2/wkliao/testfile.nc + * netcdf testfile { + * // file format: CDF-5 (big variables) + * dimensions: + * Y = 4 ; + * X = 10 ; + * time = UNLIMITED ; // (4 currently) + * variables: + * int fix_var(Y, X) ; + * int rec_var(time, X) ; + * data: + * + * fix_var = + * 0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _, + * 0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _, + * 0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _, + * 0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _ ; + * + * rec_var = + * 0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _, + * 0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _, + * 0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _, + * 0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _ ; + * } + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +#include +#include /* strcpy(), memset() */ +#include /* basename() */ +#include +#include + +#include + +#define NY 40 +#define NX 40 +#define NDIMS 2 + 
+static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int i, j, k, rank, nprocs, err, nerrs=0; + int ncid, varid[2], dimid[2], nreqs, req, *buf; + MPI_Offset **starts=NULL; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + buf = (int*) malloc(sizeof(int) * NY * NX); + + nreqs = NY * NX * nprocs; + starts = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * nreqs); + starts[0] = (MPI_Offset*) calloc(nreqs * NDIMS, sizeof(MPI_Offset)); + for (i=1; i 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for strided put with fill mode on", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + foreach(`itype', (schar,short,int,float,double), ` + _CAT(`nerrs += test_vars_',itype)'`(out_path, coll_io, info);') - /* check whether burst buffering is enabled */ - if (inq_env_hint("nc_burst_buf", &hint_value)) { - if (strcasecmp(hint_value, "enable") == 0) bb_enabled = 1; - free(hint_value); + if (format == NC_FORMAT_CDF5 || format == NC_FORMAT_NETCDF4) { + foreach(`itype', (uchar,ushort,uint,longlong,ulonglong), ` + _CAT(`nerrs += test_vars_',itype)'`(out_path, coll_io, info);') } - ncmpi_set_default_format(NC_FORMAT_CLASSIC, NULL); - foreach(`itype', (schar,short,int,float,double), ` - _CAT(`nerrs += test_vars_',itype)'`(filename);') + return nerrs; +} - ncmpi_set_default_format(NC_FORMAT_CDF2, NULL); - foreach(`itype', (schar,short,int,float,double), ` - _CAT(`nerrs += test_vars_',itype)'`(filename);') +int main(int argc, char **argv) { - if (!bb_enabled) { -#ifdef ENABLE_NETCDF4 - ncmpi_set_default_format(NC_FORMAT_NETCDF4_CLASSIC, NULL); - foreach(`itype', 
(schar,short,int,float,double), ` - _CAT(`nerrs += test_vars_',itype)'`(filename);') + int err; + loop_opts opt; - ncmpi_set_default_format(NC_FORMAT_NETCDF4, NULL); - foreach(`itype', (schar,uchar,short,ushort,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_vars_',itype)'`(filename);') -#endif - } + MPI_Init(&argc, &argv); - ncmpi_set_default_format(NC_FORMAT_CDF5, NULL); - foreach(`itype', (schar,uchar,short,ushort,int,uint,float,double,longlong,ulonglong), ` - _CAT(`nerrs += test_vars_',itype)'`(filename);') - - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + opt.num_fmts = sizeof(nc_formats) / sizeof(int); + opt.formats = nc_formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + err = tst_main(argc, argv, "put vars with fill mode on", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/tst_version.c b/test/testcases/tst_version.c index eee7d65fbb..4379bd2792 100644 --- a/test/testcases/tst_version.c +++ b/test/testcases/tst_version.c @@ -2,7 +2,7 @@ * Copyright (C) 2019, Northwestern University and Argonne 
National Laboratory * See COPYRIGHT notice in top-level directory. * - * Check whether PnetCDF version string returned from ncmpi_inq_libvers() + * Check whether PnetCDF version string returned from ncmpi_inq_libvers() * matches the constant PNETCDF_VERSION defined in header file pnetcdf.h. * */ @@ -15,21 +15,15 @@ #include -/*----< main() >------------------------------------------------------------*/ -int main(int argc, char **argv) +static +int test_io(const char *out_path, /* ignored */ + const char *in_path, /* ignored */ + int format, /* ignored */ + int coll_io, /* ignored */ + MPI_Info info) /* ignored */ { char *str, *pnetcdf_version_str; - int err, nerrs=0, rank; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for PnetCDF library version ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } + int nerrs=0; str = (char*) malloc(strlen(ncmpi_inq_libvers())+1); strcpy(str, ncmpi_inq_libvers()); @@ -47,24 +41,32 @@ int main(int argc, char **argv) } free(str); - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + int formats[] = {0}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 0; /* test intra-node 
aggregation */ + opt.drv = 0; /* test PNCIO driver */ + opt.ind = 0; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 0; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 0; /* run ncmpidiff for file header only */ + opt.var_diff = 0; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "PnetCDF library version", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/varn_contig.c b/test/testcases/varn_contig.c index 164d717af0..f74780fe2d 100644 --- a/test/testcases/varn_contig.c +++ b/test/testcases/varn_contig.c @@ -70,41 +70,31 @@ int check_contents_for_fail(int *buffer) return 0; } -int main(int argc, char** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) { - char filename[256]; int i, rank, nprocs, err, nerrs=0; - int ncid, cmode, varid[3], dimid[2], num_reqs, *buffer, *r_buffer; + int ncid, varid[3], dimid[2], num_reqs, *buffer, *r_buffer; MPI_Offset w_len, **starts=NULL, **counts=NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for put_varn with contig fileview", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - #ifdef DEBUG if (nprocs != 4 && rank == 0) printf("Warning: %s is intended to run on 4 processes\n",argv[0]); #endif + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR /* create a global array of size NY * NX */ @@ -125,6 +115,11 @@ int main(int argc, char** argv) err = ncmpi_sync(ncid); CHECK_ERR + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* pick arbitrary numbers of requests for 4 processes */ num_reqs = 0; if (rank == 0) num_reqs = 4; @@ -202,21 +197,34 @@ int main(int argc, char** argv) for (i=0; i 4) MPI_Barrier(MPI_COMM_WORLD); /* read back and check contents */ memset(r_buffer, 0, NY*NX*sizeof(int)); - err = ncmpi_get_var_int_all(ncid, varid[0], r_buffer); + if (coll_io) + err = ncmpi_get_var_int_all(ncid, varid[0], r_buffer); + else + err = ncmpi_get_var_int(ncid, varid[0], r_buffer); CHECK_ERR nerrs += check_contents_for_fail(r_buffer); @@ -232,24 +240,33 @@ int main(int argc, char** argv) free(counts); } - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } + +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + 
opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "put_varn with contig fileview", opt, test_io); MPI_Finalize(); - return (nerrs > 0); -} + return err; +} diff --git a/test/testcases/varn_int.c b/test/testcases/varn_int.c index f824690867..4a32c67379 100644 --- a/test/testcases/varn_int.c +++ b/test/testcases/varn_int.c @@ -23,11 +23,11 @@ * X = 10 ; * REC_DIM = UNLIMITED ; // (4 currently) * variables: - * int var(Y, X) ; + * int fix_var(Y, X) ; * int rec_var(REC_DIM, X) ; * data: * - * var = + * fix_var = * 13, 13, 13, 11, 11, 10, 10, 12, 11, 11, * 10, 12, 12, 12, 13, 11, 11, 12, 12, 12, * 11, 11, 12, 13, 13, 13, 10, 10, 11, 11, @@ -55,9 +55,9 @@ #define NDIMS 2 static -int check_contents_for_fail(int *buffer) +int check_contents_for_fail(char *var_name, int *buffer) { - int i, nprocs; + int i, err=0, nprocs; int expected[NY*NX] = {13, 13, 13, 11, 11, 10, 10, 12, 11, 11, 10, 12, 12, 12, 13, 11, 11, 12, 12, 12, 11, 11, 12, 13, 13, 13, 10, 10, 11, 11, @@ -67,14 +67,17 @@ int check_contents_for_fail(int *buffer) /* check if the contents of buf are expected */ for (i=0; i= nprocs) continue; + if (expected[i] >= nprocs+10) continue; if (buffer[i] != expected[i]) { - printf("Expected read buf[%d]=%d, but got %d\n", - i,expected[i],buffer[i]); - return 1; + printf("Error: var %s expect read buf[%d]=%d, but got %d\n", + var_name, i,expected[i],buffer[i]); + err = 1; + break; } } - return 0; + MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + + return (err > 0); } static @@ -87,41 +90,23 @@ void permute(MPI_Offset a[NDIMS], MPI_Offset b[NDIMS]) } } -int main(int argc, 
char** argv) +#define INDEP_MODE 0 +#define COLL_MODE 1 + +static +int tst_io(const char *filename, + int coll_io, + MPI_Info info) { - char filename[256]; int i, j, rank, nprocs, err, nerrs=0; - int ncid, cmode, varid[3], dimid[2], num_reqs, *buffer, *r_buffer; + int ncid, varid[2], dimid[2], num_reqs=0, *buffer=NULL, *r_buffer=NULL; MPI_Offset w_len, **starts=NULL, **counts=NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for ncmpi_put_varn_int_all() ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - -#ifdef DEBUG - if (nprocs != 4 && rank == 0) - printf("Warning: %s is intended to run on 4 processes\n",argv[0]); -#endif - /* create a new file for writing ----------------------------------------*/ - cmode = NC_CLOBBER | NC_64BIT_DATA; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR /* create a global array of size NY * NX */ @@ -129,7 +114,7 @@ int main(int argc, char** argv) CHECK_ERR err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); CHECK_ERR - err = ncmpi_def_var(ncid, "var", NC_INT, NDIMS, dimid, &varid[0]); + err = ncmpi_def_var(ncid, "fix_var", NC_INT, NDIMS, dimid, &varid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -148,6 +133,11 @@ int main(int argc, char** argv) } } + if (!coll_io) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* pick arbitrary numbers of requests for 4 processes */ num_reqs = 0; if (rank == 0) 
num_reqs = 4; @@ -231,11 +221,17 @@ int main(int argc, char** argv) for (i=0; i 4) MPI_Barrier(MPI_COMM_WORLD); + + if (nprocs > 4 || !coll_io) { + /* This program was designed to run on 4 processes. If running on more + * than 4, then we need to sync writes before reading, especially for + * processes of rank >= 4. Similarly, when running in independent data + * mode, flushing writes is necessary before reading the data back. + */ + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + } /* read back and check contents */ memset(r_buffer, 0, NY*NX*sizeof(int)); - err = ncmpi_get_var_int_all(ncid, varid[0], r_buffer); + if (coll_io) + err = ncmpi_get_var_int_all(ncid, varid[0], r_buffer); + else + err = ncmpi_get_var_int(ncid, varid[0], r_buffer); CHECK_ERR - nerrs += check_contents_for_fail(r_buffer); + nerrs += check_contents_for_fail("fix_var", r_buffer); if (nerrs > 0) goto err_out; /* permute write order */ @@ -263,7 +273,10 @@ int main(int argc, char** argv) } /* write using varn API */ - err = ncmpi_put_varn_int_all(ncid, varid[1], num_reqs, starts, counts, buffer); + if (coll_io) + err = ncmpi_put_varn_int_all(ncid, varid[1], num_reqs, starts, counts, buffer); + else + err = ncmpi_put_varn_int(ncid, varid[1], num_reqs, starts, counts, buffer); CHECK_ERR /* check if user put buffer contents altered */ @@ -276,21 +289,39 @@ int main(int argc, char** argv) } } + if (nprocs > 4 || !coll_io) { + /* This program was designed to run on 4 processes. If running on more + * than 4, then we need to sync writes before reading, especially for + * processes of rank >= 4. Similarly, when running in independent data + * mode, flushing writes is necessary before reading the data back. 
+ */ + MPI_Barrier(MPI_COMM_WORLD); + err = ncmpi_sync(ncid); + CHECK_ERR + MPI_Barrier(MPI_COMM_WORLD); + } + /* read back using get_var API and check contents */ memset(r_buffer, 0, NY*NX*sizeof(int)); - err = ncmpi_get_var_int_all(ncid, varid[1], r_buffer); + if (coll_io) + err = ncmpi_get_var_int_all(ncid, varid[1], r_buffer); + else + err = ncmpi_get_var_int(ncid, varid[1], r_buffer); CHECK_ERR - nerrs += check_contents_for_fail(r_buffer); + nerrs += check_contents_for_fail("rec_var", r_buffer); if (nerrs > 0) goto err_out; /* read back using get_varn API and check contents */ for (i=0; i 0) { free(starts[0]); free(counts[0]); @@ -336,24 +370,69 @@ int main(int argc, char** argv) free(counts); } - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return nerrs; +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, + MPI_Info info) +{ + int err, nerrs=0; + MPI_Info info_dup; - MPI_Finalize(); - return (nerrs > 0); + MPI_Info_dup(info, &info_dup); + + /* Set format. 
*/ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + +#ifdef DEBUG + if (nprocs != 4 && rank == 0) + printf("Warning: %s is intended to run on 4 processes\n", + basename(__FILE__)); +#endif + + nerrs = tst_io(out_path, coll_io, info_dup); + if (nerrs > 0) goto err_out; + + /* disable PnetCDF internal buffering: set the hint on info_dup, the copy handed to tst_io(), so the caller's info is left untouched */ + MPI_Info_set(info_dup, "nc_ibuf_size", "0"); + + nerrs = tst_io(out_path, coll_io, info_dup); + if (nerrs > 0) goto err_out; + +err_out: + MPI_Info_free(&info_dup); + + return nerrs; } +int main(int argc, char **argv) { + + int err; + int formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 1; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "ncmpi_put_varn_int_all()", opt, test_io); + + MPI_Finalize(); + + return err; +} diff --git a/test/testcases/varn_intf.f b/test/testcases/varn_intf.f index b1dc33f84e..db9ce6d7a1 100644 --- a/test/testcases/varn_intf.f +++ b/test/testcases/varn_intf.f @@ -67,8 +67,8 @@ subroutine check(err, message) ! It is a good idea to check returned value for possible error if (err .NE. NF_NOERR) then write(6,*) message(1:XTRIM(message)), nfmpi_strerror(err) - msg = '*** TESTING F77 varn_intf.f for varn API ' - call pass_fail(1, msg) + msg = '*** TESTING F77 varn_intf.f - varn API ' + call pass_fail(1, msg, 0.0D0) STOP 2 end if end !
subroutine check @@ -82,9 +82,10 @@ program main integer*8 NX, NY PARAMETER(NDIMS=2, NX=4, NY=10) - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer i, j, err, ierr, nprocs, rank, nerrs, get_args integer cmode, ncid, varid, dimid(NDIMS), num_reqs + logical keep_files integer*8 w_len, w_req_len integer*8 starts(NDIMS, 13) @@ -92,22 +93,29 @@ program main integer*8 malloc_size, sum_size integer buffer(13) integer old_fillmode + double precision timing call MPI_Init(ierr) + + timing = MPI_Wtime() + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ. 0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, + MPI_COMM_WORLD, ierr) + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, + + ierr) + nerrs = 0 if (.FALSE. .AND. nprocs .NE. 4 .AND. rank .EQ. 0) @@ -116,7 +124,7 @@ program main ! create file, truncate it if exists cmode = IOR(NF_CLOBBER, NF_64BIT_DATA) - err = nfmpi_create(MPI_COMM_WORLD, filename, cmode, + err = nfmpi_create(MPI_COMM_WORLD, out_path, cmode, + MPI_INFO_NULL, ncid) call check(err, 'In nfmpi_create: ') @@ -286,10 +294,18 @@ program main + sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, + + MPI_DOUBLE_PRECISION, MPI_MAX, + + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then + if (.NOT. 
keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + msg = '*** TESTING F77 '//cmd(1:XTRIM(cmd))// - + ' for varn API ' - call pass_fail(nerrs, msg) + + ' - varn API ' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/varn_real.f90 b/test/testcases/varn_real.f90 index 7ec3a2eb40..02f2a3ef3a 100644 --- a/test/testcases/varn_real.f90 +++ b/test/testcases/varn_real.f90 @@ -42,8 +42,8 @@ subroutine check(err, message) ! It is a good idea to check returned value for possible error if (err .NE. NF90_NOERR) then write(6,*) message, trim(nf90mpi_strerror(err)) - msg = '*** TESTING F90 varn_real.f90 for varn API ' - call pass_fail(1, msg) + msg = '*** TESTING F90 varn_real.f90 - varn API ' + call pass_fail(1, msg, 0.0D0) ! call MPI_Abort(MPI_COMM_WORLD, -1, err) STOP 2 end if @@ -57,7 +57,7 @@ program main integer NDIMS PARAMETER(NDIMS=2) - character(LEN=256) filename, cmd, msg + character(LEN=256) out_path, in_path, cmd, msg integer rank, nprocs, err, ierr, num_reqs, get_args integer ncid, cmode, varid, dimid(2), y, x, i, j, nerrs integer old_fillmode @@ -69,23 +69,30 @@ program main integer(kind=MPI_OFFSET_KIND), allocatable :: starts(:,:) integer(kind=MPI_OFFSET_KIND), allocatable :: counts(:,:) integer(kind=MPI_OFFSET_KIND) malloc_size, sum_size + logical keep_files + double precision timing + + call MPI_Init(ierr) + + timing = MPI_Wtime() NY = 4 NX = 10 - call MPI_Init(ierr) call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) - ! take filename from command-line argument if there is any + ! take out_path from command-line argument if there is any if (rank .EQ. 0) then - filename = "testfile.nc" - err = get_args(cmd, filename) + out_path = "testfile.nc" + err = get_args(cmd, out_path, in_path, keep_files) endif call MPI_Bcast(err, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) if (err .EQ.
0) goto 999 - call MPI_Bcast(filename, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_Bcast(out_path, 256, MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + + call MPI_Bcast(keep_files, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) nerrs = 0 @@ -95,7 +102,7 @@ program main ! create file, truncate it if exists cmode = IOR(NF90_CLOBBER, NF90_64BIT_DATA) - err = nf90mpi_create(MPI_COMM_WORLD, filename, cmode, & + err = nf90mpi_create(MPI_COMM_WORLD, out_path, cmode, & MPI_INFO_NULL, ncid) call check(err, 'In nf90mpi_create: ') @@ -306,9 +313,17 @@ program main sum_size, ' bytes yet to be freed' endif + timing = MPI_Wtime() - timing + call MPI_Allreduce(MPI_IN_PLACE, timing, 1, & + MPI_DOUBLE_PRECISION, MPI_MAX, & + MPI_COMM_WORLD, ierr) if (rank .eq. 0) then - msg = '*** TESTING F90 '//trim(cmd)//' for varn API ' - call pass_fail(nerrs, msg) + if (.NOT. keep_files) then + err = nfmpi_delete(out_path, MPI_INFO_NULL) + end if + + msg = '*** TESTING F90 '//trim(cmd)//' - varn API ' + call pass_fail(nerrs, msg, timing) endif 999 call MPI_Finalize(ierr) diff --git a/test/testcases/vectors.c b/test/testcases/vectors.c index 0b7ef5e717..df550f7778 100644 --- a/test/testcases/vectors.c +++ b/test/testcases/vectors.c @@ -21,7 +21,12 @@ #define BLOCKLEN 3 #define STRIDE 5 -int main(int argc, char ** argv) +static +int test_io(const char *out_path, + const char *in_path, /* ignored */ + int format, + int coll_io, /* ignored */ + MPI_Info info) { int ncid, dimid, varid, rank, nprocs; MPI_Datatype vtype, rtype, usertype; @@ -31,39 +36,29 @@ int main(int argc, char ** argv) int count = 25; double pi = 3.14159; MPI_Offset start, acount; - char filename[256]; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - - if (rank == 0) { - 
char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for put_vara/get_vara ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - #ifdef DEBUG if (nprocs > 2 && rank == 0) - printf("Warning: %s is designed to run on 1 process\n",argv[0]); + printf("Warning: %s is designed to run on 1 process\n", basename(__FILE__)); /* argv is not in scope inside test_io() */ #endif - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); + /* Set format. */ + err = ncmpi_set_default_format(format, NULL); + CHECK_ERR + + err = ncmpi_create(MPI_COMM_WORLD, out_path, NC_CLOBBER, info, &ncid); CHECK_ERR err = ncmpi_def_dim(ncid, "50k", 1024*50, &dimid); CHECK_ERR err = ncmpi_def_var(ncid, "vector", NC_DOUBLE, 1, &dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); + CHECK_ERR + err = ncmpi_enddef(ncid); CHECK_ERR @@ -90,10 +85,14 @@ int main(int argc, char ** argv) CHECK_ERR } + /* file sync before reading */ + err = ncmpi_sync(ncid); + CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); + err = ncmpi_open(MPI_COMM_WORLD, out_path, NC_NOWRITE, info, &ncid); CHECK_ERR err = ncmpi_begin_indep_data(ncid); CHECK_ERR @@ -120,23 +119,34 @@ int main(int argc, char ** argv) free(userbuf); free(cmpbuf); - /* check if PnetCDF freed all internal malloc */ - MPI_Offset malloc_size, sum_size; - err = ncmpi_inq_malloc_size(&malloc_size); - if (err == NC_NOERR) { - MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); - if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", - sum_size); - if (malloc_size > 0) ncmpi_inq_malloc_list(); - } + return (nerrs > 0); +} - MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - if (rank == 0) { - if (nerrs) printf(FAIL_STR,nerrs); - else printf(PASS_STR); - } +int main(int argc, char **argv) { + + int err; + + /* flexible APIs are not supported in NetCDF4 */ + int
formats[] = {NC_FORMAT_CLASSIC, NC_FORMAT_64BIT_OFFSET, NC_FORMAT_64BIT_DATA}; + + loop_opts opt; + + MPI_Init(&argc, &argv); + + opt.num_fmts = sizeof(formats) / sizeof(int); + opt.formats = formats; + opt.ina = 1; /* test intra-node aggregation */ + opt.drv = 1; /* test PNCIO driver */ + opt.ind = 1; /* test hint romio_no_indep_rw */ + opt.chk = 0; /* test hint nc_data_move_chunk_size */ + opt.bb = 1; /* test burst-buffering feature */ + opt.mod = 0; /* test independent data mode */ + opt.hdr_diff = 1; /* run ncmpidiff for file header only */ + opt.var_diff = 1; /* run ncmpidiff for variables */ + + err = tst_main(argc, argv, "put_vara/get_vara", opt, test_io); MPI_Finalize(); - return (nerrs > 0); + + return err; } diff --git a/test/testcases/wrap_runs.sh b/test/testcases/wrap_runs.sh index 280216b4df..80c13401b0 100755 --- a/test/testcases/wrap_runs.sh +++ b/test/testcases/wrap_runs.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Copyright (C) 2003, Northwestern University and Argonne National Laboratory # See COPYRIGHT notice in top-level directory. 
@@ -16,7 +16,7 @@ outfile=`basename $1` OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,8 +26,37 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + if [[ "$1" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc # echo "" @@ -36,7 +65,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -49,6 +78,7 @@ for j in ${safe_modes} ; do fi fi done +done rm -f ${OUTDIR}/$outfile.nc* rm -f ${OUTDIR}/$outfile.bb.nc*