From b44d06beea4e82f201c3e52a2fe23dfef221d740 Mon Sep 17 00:00:00 2001 From: Robert Schade Date: Mon, 8 Apr 2024 13:42:19 +0200 Subject: [PATCH] =?UTF-8?q?added=20dry-run=20option=20to=20convert=20pytho?= =?UTF-8?q?n=20inputs=20for=20latter=20large-scale=20mpi-run,=20additional?= =?UTF-8?q?=20POVM=20ouput=20at=20end=20instead=20of=20after=20every=20sta?= =?UTF-8?q?ge,=20on-node=20compression=20of=20output=20POVMs=20before=20wr?= =?UTF-8?q?iting=20to=20parallel=20file=20system,=20additional=C3=B6=20opt?= =?UTF-8?q?ion=20to=20disable=20benchmark=20of=20underlying=20operations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- README.md | 22 +++++++-------- condat_simplexproj.c | 4 +-- job.sh | 44 +++++++++++++++++++++++------ pqdts.f90 | 67 ++++++++++++++++++++++++++++++++++++++------ pqdts.py | 26 ++++++++++++----- 6 files changed, 128 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index 6eda12a..284583d 100755 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CC = g++ # C++ compiler FCC = gfortran #fortran compiler PFCC = mpif90 #fortran compile with MPI -CFLAGS = -O3 -march=native -fopenmp -g -flto -fno-strict-aliasing +CFLAGS = -O3 -march=native -fopenmp -g -fargument-noalias -ffreestanding -finline-functions -funroll-all-loops -fprefetch-loop-arrays CFFLAGS = -ffree-line-length-none -Werror=aliasing -Werror=ampersand -Werror=c-binding-type -Werror=intrinsic-shadow -Werror=intrinsics-std -Werror=line-truncation -Werror=tabs -Werror=target-lifetime -Werror=underflow -Werror=unused-but-set-variable -Werror=unused-variable -Werror=unused-parameter -Werror=unused-label -Werror=conversion -Werror=zerotrip -Wno-maybe-uninitialized -Wuninitialized -Wuse-without-only -fno-strict-aliasing #don't change the lines below diff --git a/README.md b/README.md index ea95777..e7646aa 100755 --- a/README.md +++ b/README.md @@ -37,19 +37,15 @@ The most convenient way of using the program is with the included 
Python wrapper This wrapper has the following options: ``` -usage: pqdts.py [-h] -P PMATRIX [-F FMATRIX] [-D DMAX] [-t THREADS] -p - PQDTSPATH [-o OUTPUT] [-e EPSILON] [-g GAMMA] [-m MAXITER] - [-b] [-v] +usage: pqdts.py [-h] -P PMATRIX [-F FMATRIX] [-D DMAX] [-t THREADS] -p PQDTSPATH [-o OUTPUT] [-e EPSILON] [-g GAMMA] [-m MAXITER] [-T] [-b] [-d] [-v] -optional arguments: +options: -h, --help show this help message and exit -P PMATRIX, --Pmatrix PMATRIX - path to npz file (scipy sparse) or npy file (numpy) of - P matrix (dimension D x N) + path to npz file (scipy sparse) or npy file (numpy) of P matrix (dimension D x N) -F FMATRIX, --Fmatrix FMATRIX - path to npz file (scipy sparse) or npy file (numpy) of - F matrix (dimension D x M) - -D DMAX, --Dmax DMAX truncate to D so that P is a D x M matrix + path to npz file (scipy sparse) or npy file (numpy) of F matrix (dimension D x M) + -D DMAX, --Dmax DMAX truncate to D so that P is a D x N matrix -t THREADS, --threads THREADS numper of OpenMP threads to use -p PQDTSPATH, --pqdtspath PQDTSPATH @@ -62,9 +58,12 @@ optional arguments: regularization parameter -m MAXITER, --maxiter MAXITER maximal number of iterations - -b, --benchmark benchmark mode, don't write output POVMs + -T, --timing measure timing for reconstruction, don't write output POVMs + -b, --benchmarkops measure timing for underlying operations + -d, --dryrun dry-run: only prepare inputs for pqdts -v, --verbose be more verbose ``` + The dependencies of the wrapper can be installed with `pip3 install -r requirements.txt`. ### Command Line Arguments @@ -76,12 +75,13 @@ Without the Python wrapper, the command line arguments of `pqdts_omp.x` and `pqd 5. number of non-zero elements in $P$ 6. computation mode: 2 for two-metric projected truncated Newton method, 1 for projected gradient method 7. maxiter: maximal number of iterations in stages -8. output: 0 to disable output of the POVMs, 1 to enable output of POVMs after every minimization stage +8. 
output: 0 to disable output of the POVMs, 1 to enable output of POVMs at the end, 2 to enable output of POVMs after every minimization stage 9. gamma: regularization parameter $\gamma$ 10. epsilon: value of the convergence criterion 11. index of stage to start with, i.e., 1 or 2 12. 0 to start with the initial guess $\varPi=1/N$, 1 to read the output from a previous run as a starting point 13. smoothing distance factor $N_s$ +14. benchmark underlying operations: 0 for no, 1 for yes ### Inputs Without the Python wrapper, the programs `pqdts_omp.x` and `pqdts_omp_mpi.x` expect inputs ($P$ matrix and optionally the $F$ matrix) in the `data` directory in the current working directory. The following files are expected: diff --git a/condat_simplexproj.c b/condat_simplexproj.c index eaff275..209c1ef 100755 --- a/condat_simplexproj.c +++ b/condat_simplexproj.c @@ -59,7 +59,7 @@ extern "C"{ void simplexproj_condat_(double* y, double* x, int* plength) { int length=plength[0]; - double* aux = (x==y ? (double*)malloc(length*sizeof(double)) : x); + double* aux = x; //(x==y ? (double*)malloc(length*sizeof(double)) : x); double* aux0=aux; int auxlength=1; int auxlengthold=-1; @@ -91,6 +91,6 @@ void simplexproj_condat_(double* y, double* x, int* plength) { } while (auxlength<=auxlengthold); for (i=0; itau ? 
y[i]-tau : 0.0); - if (x==y) free(aux0); +// if (x==y) free(aux0); } } diff --git a/job.sh b/job.sh index 8fd1d66..4724b95 100644 --- a/job.sh +++ b/job.sh @@ -1,21 +1,49 @@ #!/bin/bash -#SBATCH -t 1-0 -#SBATCH -N 1 -#SBATCH -n 1 +#SBATCH -t 0:10:00 +#SBATCH --mem=230G #SBATCH --exclusive -#SBATCH --cpus-per-task=128 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=16 +#SBATCH -x n2dgx01,n2gpu12[01-32],n2hcn[01-05],n2hacc[01-03],n2cn[1101-1196] +#module reset +#module load perf/OSU-Micro-Benchmarks/5.7.1-gompi-2021b +#srun --ntasks-per-node=1 -n $SLURM_JOB_NUM_NODES osu_mbw_mr > osu.out module reset module load toolchain/foss/2023b module load lang/Python/3.11.5-GCCcore-13.2.0 +module load lib/zstd/1.5.5-GCCcore-13.2.0 source ./pqdts_env/bin/activate +make clean +make pqdts +make pqdts_mpi + export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK export OMP_PLACES=cores export OMP_PROC_BIND=true -D=$1 -mkdir simulated_D_8200_Dmax_${D} -cd simulated_D_8200_Dmax_${D} -python3 ../pqdts.py -P ../../det-tomo/simulation/data/simulatedP_D_8200.npz -t $SLURM_CPUS_PER_TASK --pqdtspath ../pqdts_omp.x -m 1000 -g 0.0 -e 0.000000001 -D $D -v -b > simulated_D_8200_Dmax_${D}.out +D=`echo "sqrt(200000000000/8/6/151*$SLURM_JOB_NUM_NODES)" | bc` +#eps=`echo "0.000001/$D" | bc -l` +eps=`echo "0.0000001/$D" | bc -l` +#eps=1 +dir="simulated_D_260001_Nodes_${SLURM_JOB_NUM_NODES}_Dmax_${D}_$1" +rm -rf $dir +mkdir $dir +cd $dir + +cp ../pqdts_omp_mpi.x . 
+python ../pqdts.py -P ../simulatedP_D_260001.npz -p ./pqdts_omp_mpi.x -v -b -d -D $D -e $eps > prep.out 2> prep.err + +cmd=`tail -n1 prep.out` +wd=`pwd` +srun --ntasks-per-node=1 -n $SLURM_JOB_NUM_NODES cp pqdts_omp_mpi.x /dev/shm +srun --ntasks-per-node=1 -n $SLURM_JOB_NUM_NODES cp -r data /dev/shm + +cd /dev/shm +srun $cmd > $wd/run.out 2> $wd/run.err + +#compress output +srun --ntasks-per-node=1 -n $SLURM_JOB_NUM_NODES bash $wd/../compress_and_copy.sh $wd + diff --git a/pqdts.f90 b/pqdts.f90 index c5b05a9..817cffb 100755 --- a/pqdts.f90 +++ b/pqdts.f90 @@ -183,7 +183,6 @@ subroutine tprint(iter) write (43, *) iter, timingn(i), timingc(i), timingsum(i) end do call flush (43) - end subroutine subroutine fprob(x, mean, prob) @@ -1064,6 +1063,7 @@ subroutine conv_test(X, DLDX, tprint, conv) call MPI_ALLREDUCE(MPI_IN_PLACE, conv, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, ierr) #endif conv = sqrt(conv/(real(N, kind=real64)*real(M, kind=real64))) + !conv = sqrt(conv/(real(N, kind=real64))) end subroutine subroutine read_unformatted_binary_from_python(M, N, filename, A) @@ -1202,7 +1202,7 @@ program pqdts real(kind=real64) :: conv, xmin logical :: tproj = .true. logical, parameter :: testder = .false. - logical, parameter :: tbench = .false. + logical :: tbench = .false. real(kind=real64), parameter :: dxp = 1e-7 real(kind=real64) :: tactivetol = 1e-8 real(kind=real64) :: conv_thres = 1e-6 @@ -1214,7 +1214,7 @@ program pqdts integer(kind=int32) :: tactive = 0 logical :: precond = .false. real(kind=real64) :: Fsize = 0, dlalpha, smo_fact - integer(kind=int32) :: output, state, start_stage, rank2, read_input + integer(kind=int32) :: output, state, start_stage, rank2, read_input, bench integer(kind=int64) :: nact integer(kind=int64) :: M2, N2, D2, localrows2 Character(len=255) :: str @@ -1266,6 +1266,12 @@ program pqdts read (arg, *) read_input CALL getarg(13, arg) read (arg, *) smo_fact + CALL getarg(14, arg) + read (arg, *) bench + + if(bench.eq.1)then + tbench=.true. 
+ endif #ifdef MPI call MPI_Init(ierror) @@ -1309,7 +1315,6 @@ program pqdts end if print *, rank, "localrows", localrows0, localrows1, localrows, rowdist(localrows0), rowdist(localrows1) - if (Fl .ge. 0) then !FIXME only read F partially call read_sparse_from_python(Fl, "data/F_row.bin", "data/F_col.bin", "data/F_data.bin", Fi, Fj, Fd) @@ -1531,7 +1536,7 @@ program pqdts else call random_number(P) end if - + allocate (X(localrows, N)) call pset(M, N, 0.0D0, x) allocate (dLdX(localrows, N)) @@ -1594,10 +1599,10 @@ program pqdts #endif t1 = omp_get_wtime() call dL(X, dLdX) - t2 = omp_get_wtime() #ifdef MPI call MPI_Barrier(MPI_COMM_WORLD, ierr) #endif + t2 = omp_get_wtime() if (rank .eq. 0) print *, "t dL", t2 - t1, dLdX(1, 1) end do @@ -2131,8 +2136,11 @@ program pqdts end if end do - call conv_test(X, DLDX, output.ne.0, conv) - if (output .ge. 1) then + call conv_test(X, DLDX, output.ge.3, conv) + if (output .ge. 2) then +#ifdef MPI + call MPI_Barrier(MPI_COMM_WORLD, ierr) +#endif call tstart("output") !export write (outname, '(a,i6,a,i6,a)') "rank_", rank, "_oiter", oiter, ".dat" @@ -2151,11 +2159,51 @@ program pqdts write (42) X close (42) call tstop("output") +#ifdef MPI + call MPI_Barrier(MPI_COMM_WORLD, ierr) +#endif end if call tprint(oiter) end do call tstop("min") + + if (output .ge. 
1) then + !some deallocation to free memory to write to /dev/shm for compression + deallocate(dldx) + deallocate(sn) + deallocate(xtmp) + deallocate(O) + deallocate(z) + deallocate(ik) +#ifdef MPI + call MPI_Barrier(MPI_COMM_WORLD, ierr) +#endif + call tstart("output") + !export + write (outname, '(a,i6,a,i6,a)') "rank_", rank, "_final.dat" + print *, outname + INQUIRE (FILE=outname, EXIST=file_exists) + if (file_exists) then + open (42, file=trim(outname), status='old', FORM='unformatted') + else + open (42, file=trim(outname), status='new', FORM='unformatted') + end if + write (42) M + write (42) N + write (42) D + write (42) localrows + write (42) rank + write (42) X + close (42) + call tstop("output") +#ifdef MPI + call MPI_Barrier(MPI_COMM_WORLD, ierr) +#endif + end if call tprint(-1) +#ifdef MPI + call MPI_Barrier(MPI_COMM_WORLD, ierr) +#endif !get memory usage open (43, file="/proc/self/status", status='old', iostat=state) @@ -2173,6 +2221,9 @@ program pqdts end do close (43) close (44) +#ifdef MPI + call MPI_Barrier(MPI_COMM_WORLD, ierr) +#endif #ifdef MPI call MPI_Finalize(ierror) diff --git a/pqdts.py b/pqdts.py index ceeefd9..3b39d51 100644 --- a/pqdts.py +++ b/pqdts.py @@ -27,7 +27,7 @@ def npz_to_fortran(npz,matname): coodata.tofile('data/'+matname+'_data.bin') return len(coo.data) -def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,smo_dist=0,start_stage=1,read_input=0,smo_fact=0,output=1,verbose=False,F=None): +def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,smo_dist=0,start_stage=1,read_input=0,smo_fact=0,output=1,verbose=False,F=None,dry=False,benchops=0): #set environment variables for OpenMP os.environ["OMP_NUM_THREADS"] = str(threads) os.environ["OMP_PROC_BIND"] = "True" @@ -38,10 +38,14 @@ def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,s if not (F is None): NF=npz_to_fortran(F,"F") #call pqdts - cmd=pqdtspath+" "+str(M)+" "+str(N)+" "+str(D)+" "+str(NF)+" 
"+str(NP)+" 2 "+str(maxiter)+" "+str(output)+" "+str(gamma)+" "+str(tol)+" "+str(start_stage)+" "+str(read_input)+" "+str(smo_fact) + cmd=pqdtspath+" "+str(M)+" "+str(N)+" "+str(D)+" "+str(NF)+" "+str(NP)+" 2 "+str(maxiter)+" "+str(output)+" "+str(gamma)+" "+str(tol)+" "+str(start_stage)+" "+str(read_input)+" "+str(smo_fact)+" "+str(benchops) + if dry: + print("not executing:") + print(cmd) + return None + if verbose: print("executing:",cmd) - start = time.time() out=subprocess.run(cmd, shell=True, capture_output=True) end = time.time() @@ -93,7 +97,9 @@ def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,s parser.add_argument("-e", "--epsilon", help="convergence parameter of minimization",type=float,default=1e-6) parser.add_argument("-g", "--gamma", help="regularization parameter",type=float,default=0) parser.add_argument("-m", "--maxiter", help="maximal number of iterations",type=int,default=200) -parser.add_argument("-b", "--benchmark", help="benchmark mode, don't write output POVMs",action='store_true') +parser.add_argument("-T", "--timing", help="measure timing for reconstruction, don't write output POVMs",action='store_true') +parser.add_argument("-b", "--benchmarkops", help="measure timing for underlying operations",action='store_true') +parser.add_argument("-d", "--dryrun", help="dry-run: only prepare inputs for pqdts",action='store_true') parser.add_argument("-v", "--verbose", help="be more verbose",action='store_true') args = parser.parse_args() @@ -136,12 +142,18 @@ def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,s #call OpenMP-version of pqdts output=1 -if args.benchmark: +benchops=0 +if args.timing: output=0 +if args.benchmarkops: + benchops=1 if not(args.Fmatrix is None): - povm=run_pqdts(N,M,D,P,threads=args.threads,pqdtspath=args.pqdtspath,maxiter=args.maxiter,tol=args.epsilon,gamma=args.gamma,verbose=args.verbose,output=output,F=F) + 
povm=run_pqdts(N,M,D,P,threads=args.threads,pqdtspath=args.pqdtspath,maxiter=args.maxiter,tol=args.epsilon,gamma=args.gamma,verbose=args.verbose,output=output,F=F,dry=args.dryrun,benchops=benchops) else: - povm=run_pqdts(N,M,D,P,threads=args.threads,pqdtspath=args.pqdtspath,maxiter=args.maxiter,tol=args.epsilon,gamma=args.gamma,verbose=args.verbose,output=output) + povm=run_pqdts(N,M,D,P,threads=args.threads,pqdtspath=args.pqdtspath,maxiter=args.maxiter,tol=args.epsilon,gamma=args.gamma,verbose=args.verbose,output=output,dry=args.dryrun,benchops=benchops) + +if povm is None: + quit() #check constraints x1=0