added dry-run option to convert python inputs for later large-scale …

…mpi-run, additional POVM output at end instead of after every stage, on-node compression of output POVMs before writing to parallel file system, additional option to disable benchmark of underlying operations
pc2 · Apr 8, 2024 · b44d06b · b44d06b
1 parent 3cbaf9c
commit b44d06b
Show file tree

Hide file tree

Showing 6 changed files with 128 additions and 37 deletions.
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@
 CC = g++ # C++ compiler
 FCC = gfortran #fortran compiler
 PFCC = mpif90 #fortran compile with MPI
-CFLAGS = -O3 -march=native -fopenmp -g -flto -fno-strict-aliasing
+CFLAGS = -O3 -march=native -fopenmp -g -fargument-noalias -ffreestanding -finline-functions -funroll-all-loops -fprefetch-loop-arrays
 CFFLAGS = -ffree-line-length-none -Werror=aliasing -Werror=ampersand -Werror=c-binding-type -Werror=intrinsic-shadow -Werror=intrinsics-std -Werror=line-truncation -Werror=tabs -Werror=target-lifetime -Werror=underflow -Werror=unused-but-set-variable -Werror=unused-variable -Werror=unused-parameter -Werror=unused-label -Werror=conversion -Werror=zerotrip -Wno-maybe-uninitialized -Wuninitialized -Wuse-without-only -fno-strict-aliasing
 
 #don't change the lines below

diff --git a/README.md b/README.md
@@ -37,19 +37,15 @@ The most convenient way of using the program is with the included Python wrapper
 This wrapper has the following options:
 
 ```
-usage: pqdts.py [-h] -P PMATRIX [-F FMATRIX] [-D DMAX] [-t THREADS] -p
-                PQDTSPATH [-o OUTPUT] [-e EPSILON] [-g GAMMA] [-m MAXITER]
-                [-b] [-v]
+usage: pqdts.py [-h] -P PMATRIX [-F FMATRIX] [-D DMAX] [-t THREADS] -p PQDTSPATH [-o OUTPUT] [-e EPSILON] [-g GAMMA] [-m MAXITER] [-T] [-b] [-d] [-v]
 
-optional arguments:
+options:
   -h, --help            show this help message and exit
   -P PMATRIX, --Pmatrix PMATRIX
-                        path to npz file (scipy sparse) or npy file (numpy) of
-                        P matrix (dimension D x N)
+                        path to npz file (scipy sparse) or npy file (numpy) of P matrix (dimension D x N)
   -F FMATRIX, --Fmatrix FMATRIX
-                        path to npz file (scipy sparse) or npy file (numpy) of
-                        F matrix (dimension D x M)
-  -D DMAX, --Dmax DMAX  truncate to D so that P is a D x M matrix
+                        path to npz file (scipy sparse) or npy file (numpy) of F matrix (dimension D x M)
+  -D DMAX, --Dmax DMAX  truncate to D so that P is a D x N matrix
   -t THREADS, --threads THREADS
                         number of OpenMP threads to use
   -p PQDTSPATH, --pqdtspath PQDTSPATH
@@ -62,9 +58,12 @@ optional arguments:
                         regularization parameter
   -m MAXITER, --maxiter MAXITER
                         maximal number of iterations
-  -b, --benchmark       benchmark mode, don't write output POVMs
+  -T, --timing          measure timing for reconstruction, don't write output POVMs
+  -b, --benchmarkops    measure timing for underlying operations
+  -d, --dryrun          dry-run: only prepare inputs for pqdts
   -v, --verbose         be more verbose
 ```
+
 The dependencies of the wrapper can be installed with `pip3 install -r requirements.txt`.
 
 ### Command Line Arguments
@@ -76,12 +75,13 @@ Without the Python wrapper, the command line arguments of `pqdts_omp.x` and `pqd
 5. number of non-zero elements in $P$
 6. computation mode: 2 for two-metric projected truncated Newton method, 1 for projected gradient method 
 7. maxiter: maximal number of iterations in stages
-8. output: 0 to disable output of the POVMs, 1 to enable output of POVMs after every minimization stage
+8. output: 0 to disable output of the POVMs, 1 to write the POVMs once at the end, 2 to additionally write the POVMs after every minimization stage
 9. gamma: regularization parameter $\gamma$
 10. epsilon: value of the convergence criterion
 11. index of stage to start with, i.e., 1 or 2
 12. 0 to start with the initial guess $\varPi=1/N$, 1 to read the output from a previous run as a starting point
 13. smoothing distance factor $N_s$
+14. benchmark underlying operations: 0 for no, 1 for yes
 
 ### Inputs
 Without the Python wrapper, the programs `pqdts_omp.x` and `pqdts_omp_mpi.x` expect inputs ($P$ matrix and optionally the $F$ matrix) in the `data` directory in the current working directory. The following files are expected:

diff --git a/condat_simplexproj.c b/condat_simplexproj.c
@@ -59,7 +59,7 @@
 extern "C"{
 void simplexproj_condat_(double* y, double* x, int* plength) {
         int length=plength[0];
-	double*	aux = (x==y ? (double*)malloc(length*sizeof(double)) : x);
+	double*	aux = x; //(x==y ? (double*)malloc(length*sizeof(double)) : x);
 	double*  aux0=aux;
 	int		auxlength=1; 
 	int		auxlengthold=-1;	
@@ -91,6 +91,6 @@ void simplexproj_condat_(double* y, double* x, int* plength) {
 	} while (auxlength<=auxlengthold);
 	for (i=0; i<length; i++)
 		x[i]=(y[i]>tau ? y[i]-tau : 0.0); 
-	if (x==y) free(aux0);
+//	if (x==y) free(aux0);
 }
 } 
diff --git a/job.sh b/job.sh
@@ -1,21 +1,49 @@
 #!/bin/bash
-#SBATCH -t 1-0
-#SBATCH -N 1
-#SBATCH -n 1
+#SBATCH -t 0:10:00
+#SBATCH --mem=230G
 #SBATCH --exclusive
-#SBATCH --cpus-per-task=128
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=16
+#SBATCH -x n2dgx01,n2gpu12[01-32],n2hcn[01-05],n2hacc[01-03],n2cn[1101-1196]
 
+#module reset
+#module load perf/OSU-Micro-Benchmarks/5.7.1-gompi-2021b
+#srun --ntasks-per-node=1 -n $SLURM_JOB_NUM_NODES osu_mbw_mr  > osu.out
 
 module reset
 module load toolchain/foss/2023b
 module load lang/Python/3.11.5-GCCcore-13.2.0
+module load lib/zstd/1.5.5-GCCcore-13.2.0
 source ./pqdts_env/bin/activate
 
+make clean
+make pqdts
+make pqdts_mpi
+
 export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
 export OMP_PLACES=cores
 export OMP_PROC_BIND=true
 
-D=$1
-mkdir simulated_D_8200_Dmax_${D}
-cd simulated_D_8200_Dmax_${D}
-python3 ../pqdts.py -P ../../det-tomo/simulation/data/simulatedP_D_8200.npz -t $SLURM_CPUS_PER_TASK --pqdtspath ../pqdts_omp.x -m 1000 -g 0.0 -e 0.000000001 -D $D -v -b > simulated_D_8200_Dmax_${D}.out
+D=`echo "sqrt(200000000000/8/6/151*$SLURM_JOB_NUM_NODES)" | bc`
+#eps=`echo "0.000001/$D" | bc -l`
+eps=`echo "0.0000001/$D" | bc -l`
+#eps=1
+dir="simulated_D_260001_Nodes_${SLURM_JOB_NUM_NODES}_Dmax_${D}_$1"
+rm -rf $dir
+mkdir $dir
+cd $dir
+
+cp ../pqdts_omp_mpi.x .
+python ../pqdts.py -P ../simulatedP_D_260001.npz -p ./pqdts_omp_mpi.x -v -b -d -D $D -e $eps > prep.out 2> prep.err
+
+cmd=`tail -n1 prep.out`
+wd=`pwd`
+srun --ntasks-per-node=1 -n $SLURM_JOB_NUM_NODES cp pqdts_omp_mpi.x  /dev/shm
+srun --ntasks-per-node=1 -n $SLURM_JOB_NUM_NODES cp -r data /dev/shm
+
+cd /dev/shm
+srun $cmd > $wd/run.out 2> $wd/run.err
+
+#compress output
+srun --ntasks-per-node=1 -n $SLURM_JOB_NUM_NODES bash $wd/../compress_and_copy.sh $wd
+
diff --git a/pqdts.f90 b/pqdts.f90
@@ -183,7 +183,6 @@ subroutine tprint(iter)
       write (43, *) iter, timingn(i), timingc(i), timingsum(i)
     end do
     call flush (43)
-
   end subroutine
 
   subroutine fprob(x, mean, prob)
@@ -1064,6 +1063,7 @@ subroutine conv_test(X, DLDX, tprint, conv)
     call MPI_ALLREDUCE(MPI_IN_PLACE, conv, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, ierr)
 #endif
     conv = sqrt(conv/(real(N, kind=real64)*real(M, kind=real64)))
+    !conv = sqrt(conv/(real(N, kind=real64)))
   end subroutine
 
   subroutine read_unformatted_binary_from_python(M, N, filename, A)
@@ -1202,7 +1202,7 @@ program pqdts
   real(kind=real64) :: conv, xmin
   logical :: tproj = .true.
   logical, parameter :: testder = .false.
-  logical, parameter :: tbench = .false.
+  logical :: tbench = .false.
   real(kind=real64), parameter :: dxp = 1e-7
   real(kind=real64) :: tactivetol = 1e-8
   real(kind=real64) :: conv_thres = 1e-6
@@ -1214,7 +1214,7 @@ program pqdts
   integer(kind=int32) :: tactive = 0
   logical :: precond = .false.
   real(kind=real64) :: Fsize = 0, dlalpha, smo_fact
-  integer(kind=int32) :: output, state, start_stage, rank2, read_input
+  integer(kind=int32) :: output, state, start_stage, rank2, read_input, bench
   integer(kind=int64) :: nact
   integer(kind=int64) :: M2, N2, D2, localrows2
   Character(len=255) :: str
@@ -1266,6 +1266,12 @@ program pqdts
   read (arg, *) read_input
   CALL getarg(13, arg)
   read (arg, *) smo_fact
+  CALL getarg(14, arg)
+  read (arg, *) bench
+
+  if(bench.eq.1)then
+    tbench=.true.
+  endif
 
 #ifdef MPI
   call MPI_Init(ierror)
@@ -1309,7 +1315,6 @@ program pqdts
   end if
 
   print *, rank, "localrows", localrows0, localrows1, localrows, rowdist(localrows0), rowdist(localrows1)
-
   if (Fl .ge. 0) then
     !FIXME only read F partially
     call read_sparse_from_python(Fl, "data/F_row.bin", "data/F_col.bin", "data/F_data.bin", Fi, Fj, Fd)
@@ -1531,7 +1536,7 @@ program pqdts
   else
     call random_number(P)
   end if
-
+  
   allocate (X(localrows, N))
   call pset(M, N, 0.0D0, x)
   allocate (dLdX(localrows, N))
@@ -1594,10 +1599,10 @@ program pqdts
 #endif
       t1 = omp_get_wtime()
       call dL(X, dLdX)
-      t2 = omp_get_wtime()
 #ifdef MPI
       call MPI_Barrier(MPI_COMM_WORLD, ierr)
 #endif
+      t2 = omp_get_wtime()
       if (rank .eq. 0) print *, "t dL", t2 - t1, dLdX(1, 1)
     end do
 
@@ -2131,8 +2136,11 @@ program pqdts
       end if
 
     end do
-    call conv_test(X, DLDX, output.ne.0, conv)
-    if (output .ge. 1) then
+    call conv_test(X, DLDX, output.ge.3, conv)
+    if (output .ge. 2) then
+#ifdef MPI
+      call MPI_Barrier(MPI_COMM_WORLD, ierr)
+#endif
       call tstart("output")
       !export
       write (outname, '(a,i6,a,i6,a)') "rank_", rank, "_oiter", oiter, ".dat"
@@ -2151,11 +2159,51 @@ program pqdts
       write (42) X
       close (42)
       call tstop("output")
+#ifdef MPI
+      call MPI_Barrier(MPI_COMM_WORLD, ierr)
+#endif
     end if
     call tprint(oiter)
   end do
   call tstop("min")
+
+  if (output .ge. 1) then
+    !some deallocation to free memory so the output can be written to /dev/shm for compression
+    deallocate(dldx)
+    deallocate(sn)
+    deallocate(xtmp)
+    deallocate(O)
+    deallocate(z)
+    deallocate(ik)
+#ifdef MPI
+    call MPI_Barrier(MPI_COMM_WORLD, ierr)
+#endif
+    call tstart("output")
+    !export
+    write (outname, '(a,i6,a,i6,a)') "rank_", rank, "_final.dat"
+    print *, outname
+    INQUIRE (FILE=outname, EXIST=file_exists)
+    if (file_exists) then
+      open (42, file=trim(outname), status='old', FORM='unformatted')
+    else
+      open (42, file=trim(outname), status='new', FORM='unformatted')
+    end if
+    write (42) M
+    write (42) N
+    write (42) D
+    write (42) localrows
+    write (42) rank
+    write (42) X
+    close (42)
+    call tstop("output")
+#ifdef MPI
+    call MPI_Barrier(MPI_COMM_WORLD, ierr)
+#endif
+  end if
   call tprint(-1)
+#ifdef MPI
+  call MPI_Barrier(MPI_COMM_WORLD, ierr)
+#endif
 
   !get memory usage
   open (43, file="/proc/self/status", status='old', iostat=state)
@@ -2173,6 +2221,9 @@ program pqdts
   end do
   close (43)
   close (44)
+#ifdef MPI
+  call MPI_Barrier(MPI_COMM_WORLD, ierr)
+#endif
 
 #ifdef MPI
   call MPI_Finalize(ierror)

diff --git a/pqdts.py b/pqdts.py
@@ -27,7 +27,7 @@ def npz_to_fortran(npz,matname):
     coodata.tofile('data/'+matname+'_data.bin')
     return len(coo.data)
 
-def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,smo_dist=0,start_stage=1,read_input=0,smo_fact=0,output=1,verbose=False,F=None):
+def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,smo_dist=0,start_stage=1,read_input=0,smo_fact=0,output=1,verbose=False,F=None,dry=False,benchops=0):
     #set environment variables for OpenMP
     os.environ["OMP_NUM_THREADS"] = str(threads)
     os.environ["OMP_PROC_BIND"] = "True"
@@ -38,10 +38,14 @@ def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,s
     if not (F is None):
         NF=npz_to_fortran(F,"F")
     #call pqdts
-    cmd=pqdtspath+" "+str(M)+" "+str(N)+" "+str(D)+" "+str(NF)+" "+str(NP)+" 2 "+str(maxiter)+" "+str(output)+" "+str(gamma)+" "+str(tol)+" "+str(start_stage)+" "+str(read_input)+" "+str(smo_fact)
+    cmd=pqdtspath+" "+str(M)+" "+str(N)+" "+str(D)+" "+str(NF)+" "+str(NP)+" 2 "+str(maxiter)+" "+str(output)+" "+str(gamma)+" "+str(tol)+" "+str(start_stage)+" "+str(read_input)+" "+str(smo_fact)+" "+str(benchops)
+    if dry:
+        print("not executing:")
+        print(cmd)
+        return None
+
     if verbose:
         print("executing:",cmd)
-
     start = time.time()
     out=subprocess.run(cmd, shell=True, capture_output=True)
     end = time.time()
@@ -93,7 +97,9 @@ def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,s
 parser.add_argument("-e", "--epsilon", help="convergence parameter of minimization",type=float,default=1e-6)
 parser.add_argument("-g", "--gamma", help="regularization parameter",type=float,default=0)
 parser.add_argument("-m", "--maxiter", help="maximal number of iterations",type=int,default=200)
-parser.add_argument("-b", "--benchmark", help="benchmark mode, don't write output POVMs",action='store_true')
+parser.add_argument("-T", "--timing", help="measure timing for reconstruction, don't write output POVMs",action='store_true')
+parser.add_argument("-b", "--benchmarkops", help="measure timing for underlying operations",action='store_true')
+parser.add_argument("-d", "--dryrun", help="dry-run: only prepare inputs for pqdts",action='store_true')
 parser.add_argument("-v", "--verbose", help="be more verbose",action='store_true')
 args = parser.parse_args()
 
@@ -136,12 +142,18 @@ def run_pqdts(N,M,D,P,threads,pqdtspath,maxiter=100,tol=1e-6,gamma=0,smo_int=0,s
 
 #call OpenMP-version of pqdts
 output=1
-if args.benchmark:
+benchops=0
+if args.timing:
     output=0
+if args.benchmarkops:
+    benchops=1
 if not(args.Fmatrix is None):
-    povm=run_pqdts(N,M,D,P,threads=args.threads,pqdtspath=args.pqdtspath,maxiter=args.maxiter,tol=args.epsilon,gamma=args.gamma,verbose=args.verbose,output=output,F=F)
+    povm=run_pqdts(N,M,D,P,threads=args.threads,pqdtspath=args.pqdtspath,maxiter=args.maxiter,tol=args.epsilon,gamma=args.gamma,verbose=args.verbose,output=output,F=F,dry=args.dryrun,benchops=benchops)
 else:
-    povm=run_pqdts(N,M,D,P,threads=args.threads,pqdtspath=args.pqdtspath,maxiter=args.maxiter,tol=args.epsilon,gamma=args.gamma,verbose=args.verbose,output=output)
+    povm=run_pqdts(N,M,D,P,threads=args.threads,pqdtspath=args.pqdtspath,maxiter=args.maxiter,tol=args.epsilon,gamma=args.gamma,verbose=args.verbose,output=output,dry=args.dryrun,benchops=benchops)
+
+if povm is None:
+    quit()
 
 #check constraints
 x1=0