diff --git a/python/waLBerla/tools/jobscripts/__init__.py b/python/waLBerla/tools/jobscripts/__init__.py
index cafc3408aa0d926410aaef882e86ce1c359b34ea..8fa8261d5b68d11176698b9f0ecbacfabb9ed012 100644
--- a/python/waLBerla/tools/jobscripts/__init__.py
+++ b/python/waLBerla/tools/jobscripts/__init__.py
@@ -4,17 +4,17 @@
 from __future__ import print_function, absolute_import, division, unicode_literals
 from datetime import timedelta
 
-from waLBerla.tools.jobscripts.hornet import createJobscript as _cr_hornet
-from waLBerla.tools.jobscripts.supermuc import createJobscript as _cr_supermuc
+from waLBerla.tools.jobscripts.hornet import createJobscript as _cr_hornet
+from waLBerla.tools.jobscripts.supermuc import createJobscript as _cr_supermuc
 from waLBerla.tools.jobscripts.supermuc_phase2 import createJobscript as _cr_supermuc2
-from waLBerla.tools.jobscripts.juqueen import createJobscript as _cr_juqueen
+from waLBerla.tools.jobscripts.supermucng import createJobscript as _cr_supermuc_ng
 from waLBerla.tools.jobscripts.pizdaint_hybrid import createJobscript as _cr_pizdainth
 
 
 def createJobscript(*args, **kwargs):
     """
-    :param machine: Currently supported target machines are ``supermuc``, ``supermuc_phase2``, ``juqueen`` and ``hornet``
-    :param nodes: Number of nodes to run on. You can either specify nodes or cores.
+    :param machine: Currently supported target machines are ``supermuc``, ``supermuc_phase2``, ``supermuc_ng``, ``hornet`` and ``pizdaint_hybrid``
+    :param nodes: Number of nodes to run on. You can either specify nodes or cores.
     :param cores: specify either nodes or cores. If using more than one node the nodes have to be filled completely
     :param job_class: optional, the jobclass is usually computed depending on number of nodes and wall_time, this parameter overrides this
     :param initial_dir: initial working directory of the job, optional, defaults to home directory
@@ -22,29 +22,30 @@ def createJobscript(*args, **kwargs):
     :param output_file: file where stdout will be redirected to by the queueing system
     :param error_file: file where stderr will be redirected to by the queueing system
     :param energy_tag: energy tag for SuperMUC[1,2]
-
+
     Use one of the following options:
-
-    Run single program with different parameter files ( mpirun is prepended with correct number of processes )
-
+
+    Run single program with different parameter files ( mpirun is prepended with correct number of processes )
+
     :param exe_name: executable name, if not specified only the jobscript header is generated
     :param parameter_files: list of parameter files to simulate
-
+
     Run multiple programs:
-
+
     :param commands: can be either a list of two-tuples with (executableName, configFile), which are then run in this order with mpirun, or a list of strings which are just appended to the jobscript file
     """
-    if 'machine' not in kwargs:
-        raise ValueError("Specify which machine to use with 'machine=<supermuc,juqueen,hornet>'")
+    funcs = {
+        'supermuc': _cr_supermuc,
+        'supermuc_phase2': _cr_supermuc2,
+        'supermuc_ng': _cr_supermuc_ng,
+        'hornet': _cr_hornet,
+        'pizdaint_hybrid': _cr_pizdainth,
+    }
+    if 'machine' not in kwargs or kwargs['machine'] not in funcs:
+        raise ValueError("Specify which machine to use with 'machine={}'".format(list(funcs)))
 
     if 'wall_time' in kwargs and isinstance(kwargs['wall_time'], int):
         kwargs['wall_time'] = timedelta(seconds=kwargs['wall_time'])
 
-    if kwargs['machine'].lower() == 'supermuc':        return _cr_supermuc  ( *args, **kwargs )
-    if kwargs['machine'].lower() == 'supermuc_phase2': return _cr_supermuc2 ( *args, **kwargs )
-    if kwargs['machine'].lower() == 'juqueen' :        return _cr_juqueen   ( *args, **kwargs )
-    if kwargs['machine'].lower() == 'hornet' :         return _cr_hornet    ( *args, **kwargs )
-    if kwargs['machine'].lower() == 'pizdaint_hybrid': return _cr_pizdainth ( *args, **kwargs )
-    raise ValueError( "Unknown Machine: supported machines <supermuc,supermuc_phase2,juqueen,hornet,pizdaint_hybrid>" )
-
\ No newline at end of file
+    return funcs[kwargs['machine']](*args, **kwargs)
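A minimal usage sketch of the rewritten dispatcher (assuming the waLBerla Python package is importable; the executable and parameter-file names below are hypothetical):

    from datetime import timedelta
    from waLBerla.tools.jobscripts import createJobscript

    # 'machine' must be one of the keys of funcs; plain-int wall times are
    # converted to timedelta(seconds=...) before dispatch.
    script = createJobscript(machine='supermuc_ng',
                             wall_time=timedelta(hours=2),
                             nodes=8,
                             exe_name='myApp',             # hypothetical executable
                             parameter_files=['run.prm'])  # hypothetical config file
    with open('job.sh', 'w') as f:
        f.write(script)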
diff --git a/python/waLBerla/tools/jobscripts/juqueen.job b/python/waLBerla/tools/jobscripts/juqueen.job
deleted file mode 100644
index c342751f1e61e7183c95b653dae72b9cec7fbf56..0000000000000000000000000000000000000000
--- a/python/waLBerla/tools/jobscripts/juqueen.job
+++ /dev/null
@@ -1,13 +0,0 @@
-# @ job_name = {job_name}
-# @ comment = "{job_name}"
-# @ error = {error_file}.$(jobid).out
-# @ output = {output_file}.$(jobid).out
-# @ environment = COPY_ALL
-# @ wall_clock_limit = {wall_time}
-# @ notification = never
-# @ job_type = bluegene
-# @ bg_size = {nodes}
-# @ bg_connectivity = {bg_connectivity}
-# @ queue
-
-cd {initial_dir}
diff --git a/python/waLBerla/tools/jobscripts/juqueen.py b/python/waLBerla/tools/jobscripts/juqueen.py
deleted file mode 100644
index 25795507571667b5724b43cda640e5a14b77f1db..0000000000000000000000000000000000000000
--- a/python/waLBerla/tools/jobscripts/juqueen.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from __future__ import print_function, absolute_import, division, unicode_literals
-
-import os
-import math
-
-
-def createJobscript( wall_time = None, nodes = None, cores = None, job_class = None,
-                     initial_dir = '~', job_name="waLBerla",
-                     exe_name = None, parameter_files = [], commands = [], hyperthreading=2,
-                     bg_connectivity = "Torus",
-                     output_file=None, error_file=None, **kwargs ):
-
-    CORES_PER_NODE = 16 * hyperthreading
-
-
-    validNodeCountSmaller512 = [32,64,128,256]
-    def nodeCountValid( n ):
-        return ( n in validNodeCountSmaller512 ) or ( n % 512 == 0 )
-
-    def sanitizeNodes( requestedNodes ):
-        """Allowed node counts on JUQUEEN are 32,64,128,256 and multiples of 512"""
-        if requestedNodes in validNodeCountSmaller512: return requestedNodes
-        if requestedNodes % 512 == 0: return requestedNodes
-
-        for limit in validNodeCountSmaller512:
-            if requestedNodes < limit: return limit
-
-        # round up to next multiple of 512
-        return int ( (requestedNodes / 512 + 1) * 512 )
-
-
-    if nodes is not None and cores is not None:
-        raise ValueError("You can either specify nodes or cores - not both.")
-
-    if hyperthreading not in [1,2,4]:
-        raise ValueError("JUQUEEN hyperthreading has to be 1,2 or 4 (requested %d)" %(hyperthreading,) )
-
-    if nodes is None and cores is None:
-        raise ValueError('Specify either cores or nodes.')
-
-    if cores > CORES_PER_NODE and cores % CORES_PER_NODE != 0:
-        raise ValueError("When using more than one node, the number of cores has to be a multiple of 16")
-
-    if nodes is None:
-        nodes = sanitizeNodes( int(math.ceil( cores / CORES_PER_NODE )) )
-    if cores is None:
-        if not nodeCountValid( nodes ):
-            raise ValueError("Allowed node counts are 32,64,128,256 and multiples of 512")
-        cores = nodes * CORES_PER_NODE
-
-    if not output_file: output_file = job_name
-    if not error_file: error_file = job_name
-
-
-    assert( nodeCountValid(nodes) )
-
-    tasks_per_node = min( CORES_PER_NODE, cores )
-
-    template_file = os.path.join( os.path.dirname( os.path.realpath(__file__) ), "juqueen.job" )
-
-
-    result = open(template_file).read().format( cores = cores,
-                                                nodes = nodes,
-                                                initial_dir = initial_dir,
-                                                tasks_per_node = tasks_per_node,
-                                                job_name = job_name,
-                                                bg_connectivity = bg_connectivity,
-                                                output_file = output_file,
-                                                error_file = error_file,
-                                                wall_time = wall_time )
-
-    exec_line = "runjob --np %d --ranks-per-node %d : %s %s\n"
-
-    if exe_name is not None:
-        for param_file in parameter_files:
-            result += exec_line %( cores, tasks_per_node, exe_name, param_file )
-
-    for exe_paramfile_pair in commands:
-        if type(exe_paramfile_pair) is not tuple:
-            result += exe_paramfile_pair + "\n"
-        else:
-            result += exec_line %( cores, tasks_per_node, exe_paramfile_pair[0], exe_paramfile_pair[1] )
-
-
-    return result
-
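For reference, a standalone sketch of the node-count rounding the deleted juqueen.py implemented; the final step here uses integer division, where the original's true division (enabled by the __future__ import) could return a value that is not a multiple of 512:

    VALID_SMALL = [32, 64, 128, 256]

    def sanitize_nodes(n):
        # JUQUEEN accepted 32, 64, 128, 256 and multiples of 512
        if n in VALID_SMALL or n % 512 == 0:
            return n
        for limit in VALID_SMALL:
            if n < limit:
                return limit
        return (n // 512 + 1) * 512  # round up to the next multiple of 512

    assert sanitize_nodes(100) == 128
    assert sanitize_nodes(600) == 1024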
diff --git a/python/waLBerla/tools/jobscripts/pizdaint_hybrid.job b/python/waLBerla/tools/jobscripts/pizdaint_hybrid.job
index a4601918520bd12596ae5da00b39bd2a8c8d66b5..545ffb78e2793dd4fef867502bcb094262529572 100644
--- a/python/waLBerla/tools/jobscripts/pizdaint_hybrid.job
+++ b/python/waLBerla/tools/jobscripts/pizdaint_hybrid.job
@@ -2,11 +2,16 @@
 #SBATCH --job-name={job_name}
 #SBATCH --time={wall_time}
 #SBATCH --nodes={nodes}
+#SBATCH -o {output_file}
+#SBATCH -e {error_file}
 #SBATCH --ntasks-per-core={tasks_per_core}
 #SBATCH --ntasks-per-node={tasks_per_node}
 #SBATCH --cpus-per-task={cpus_per_task}
 #SBATCH --partition=normal
 #SBATCH --constraint=gpu
+{additional_lines}
+
+cd {initial_dir}
 
 module load daint-gpu
diff --git a/python/waLBerla/tools/jobscripts/pizdaint_hybrid.py b/python/waLBerla/tools/jobscripts/pizdaint_hybrid.py
index b9d369cc058d005376fbdba0cb1babd14e886e39..2f119fb0e2647036319f307d5cf08593503b8192 100644
--- a/python/waLBerla/tools/jobscripts/pizdaint_hybrid.py
+++ b/python/waLBerla/tools/jobscripts/pizdaint_hybrid.py
@@ -4,19 +4,19 @@
 import os
 import math
 
-
-def createJobscript( wall_time = None, nodes = None, cores = None, initial_dir=None, job_name="waLBerla",
-                     exe_name=None, parameter_files=[], commands=[], hyperthreading=1, **kwargs ):
+def createJobscript(wall_time=None, nodes=None, cores=None, initial_dir=None, job_name="waLBerla",
+                    exe_name=None, parameter_files=[], commands=[], hyperthreading=1,
+                    output_file=None, error_file=None, account=None, **kwargs):
     if type(hyperthreading) is bool:
         hyperthreading = 2 if hyperthreading else 1
 
     CORES_PER_NODE = 12 * hyperthreading
 
-    if wall_time and wall_time.total_seconds() > 24 * 3600:
+    if wall_time and wall_time.total_seconds() > 24 * 3600:
         raise ValueError("No jobs longer than 24h allowed")
 
     if hyperthreading > 2:
-        raise ValueError("PizDaint supports only two way hyperthreading (requested %d)" %(hyperthreading,) )
+        raise ValueError("PizDaint supports only two way hyperthreading (requested %d)" % (hyperthreading,))
 
     if nodes is not None and cores is not None:
         raise ValueError("You can either specify nodes or cores - not both.")
@@ -25,30 +25,41 @@ def createJobscript( wall_time = None, nodes = None, cores = None, initial_dir=N
         raise ValueError('Specify either cores or nodes.')
 
     if nodes is None:
-        nodes = math.ceil( cores / CORES_PER_NODE )
+        nodes = math.ceil(cores / CORES_PER_NODE)
     if cores is None:
         cores = nodes * CORES_PER_NODE
 
     if cores > CORES_PER_NODE and cores % CORES_PER_NODE != 0:
         raise ValueError("When using more than one node, the number of cores has to be a multiple of 12")
 
-    tasks_per_node = min( CORES_PER_NODE, cores )
-
-    template_file = os.path.join( os.path.dirname( os.path.realpath(__file__) ), "pizdaint_hybrid.job" )
-    result = open(template_file).read().format( cores = cores,
-                                                nodes = nodes,
-                                                tasks_per_core = hyperthreading,
-                                                tasks_per_node=tasks_per_node,
-                                                cpus_per_task = 1, # OpenMP num threads would go here
-                                                initial_dir = initial_dir,
-                                                job_name = job_name,
-                                                wall_time = wall_time )
+    if not output_file:
+        output_file = job_name
+    if not error_file:
+        error_file = job_name
+
+    tasks_per_node = min(CORES_PER_NODE, cores)
+    additional_lines = ""
+    if account:
+        additional_lines += '#SBATCH --account=%s\n' % (account,)
+
+    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pizdaint_hybrid.job")
+    result = open(template_file).read().format(cores=cores,
+                                               nodes=nodes,
+                                               tasks_per_core=hyperthreading,
+                                               tasks_per_node=tasks_per_node,
+                                               cpus_per_task=1,  # OpenMP num threads would go here
+                                               initial_dir=initial_dir,
+                                               output_file=output_file,
+                                               additional_lines=additional_lines,
+                                               error_file=error_file,
+                                               job_name=job_name,
+                                               wall_time=wall_time)
 
     exec_line = "srun %s %s \n"
 
     if exe_name is not None:
         for param_file in parameter_files:
-            result += exec_line %( cores, exe_name, param_file )
+            result += exec_line % (exe_name, param_file)
 
     for exe_paramfile_pair in commands:
         if type(exe_paramfile_pair) is not tuple:
@@ -56,6 +67,4 @@ def createJobscript( wall_time = None, nodes = None, cores = None, initial_dir=N
         else:
             result += exec_line % exe_paramfile_pair
 
-
     return result
-
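A small sketch of the core/node arithmetic the Piz Daint script enforces: 12 physical cores per node, doubled by two-way hyperthreading, and multi-node jobs must fill their nodes completely:

    import math

    hyperthreading = 2
    CORES_PER_NODE = 12 * hyperthreading  # 24 hardware threads per node

    cores = 48
    nodes = math.ceil(cores / CORES_PER_NODE)  # -> 2 nodes
    assert cores <= CORES_PER_NODE or cores % CORES_PER_NODE == 0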
diff --git a/python/waLBerla/tools/jobscripts/supermucng.job b/python/waLBerla/tools/jobscripts/supermucng.job
new file mode 100644
index 0000000000000000000000000000000000000000..0691c51afd4a6912540a0cc8775d7664eba6da3b
--- /dev/null
+++ b/python/waLBerla/tools/jobscripts/supermucng.job
@@ -0,0 +1,16 @@
+#!/bin/bash -l
+#SBATCH --job-name={job_name}
+#SBATCH --time={wall_time}
+#SBATCH --nodes={nodes}
+#SBATCH -o {output_file}
+#SBATCH -e {error_file}
+#SBATCH --ntasks-per-core={tasks_per_core}
+#SBATCH --ntasks-per-node={tasks_per_node}
+#SBATCH --cpus-per-task={cpus_per_task}
+#SBATCH --partition={partition}
+{additional_lines}
+
+cd {initial_dir}
+
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export OMP_PLACES={omp_places}
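The .job templates above are filled with str.format, which is why every placeholder uses single braces; a doubled brace would survive formatting as a literal brace instead of being substituted. A minimal sketch:

    template = ("#SBATCH --job-name={job_name}\n"
                "#SBATCH -o {output_file}\n"
                "#SBATCH -e {error_file}\n")
    print(template.format(job_name='waLBerla',
                          output_file='out.txt',  # hypothetical file names
                          error_file='err.txt'))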
diff --git a/python/waLBerla/tools/jobscripts/supermucng.py b/python/waLBerla/tools/jobscripts/supermucng.py
new file mode 100644
index 0000000000000000000000000000000000000000..cce31dadc446ff37c52ac949dcd0e10dc3a7de02
--- /dev/null
+++ b/python/waLBerla/tools/jobscripts/supermucng.py
@@ -0,0 +1,96 @@
+from __future__ import print_function, absolute_import, division, unicode_literals
+
+import os
+import math
+
+
+def createJobscript(wall_time=None, nodes=None, cores=None, job_class=None, initial_dir='~', job_name="waLBerla",
+                    exe_name=None, parameter_files=[], commands=[], hyperthreading=1,
+                    output_file=None, error_file=None, account=None, fixed_freq=True, omp_num_threads=1, **_):
+    if type(hyperthreading) == bool:
+        hyperthreading = 2 if hyperthreading else 1
+
+    cores_per_node = 48 * hyperthreading
+
+    if wall_time and wall_time.total_seconds() > 48 * 3600:
+        raise ValueError("No jobs longer than 48h allowed")
+
+    if hyperthreading > 2:
+        raise ValueError("SuperMUC supports only two way hyperthreading (requested %d)" % (hyperthreading,))
+
+    if nodes is not None and cores is not None:
+        raise ValueError("You can either specify nodes or cores - not both.")
+
+    if nodes is None and cores is None:
+        raise ValueError('Specify either cores or nodes.')
+
+    if nodes is None:
+        nodes = math.ceil(cores / cores_per_node)
+    if cores is None:
+        cores = nodes * cores_per_node
+
+    if cores > cores_per_node and cores % cores_per_node != 0:
+        raise ValueError("When using more than one node, the number of cores has to be a multiple of %d" % (cores_per_node,))
+
+    if not output_file:
+        output_file = job_name
+    if not error_file:
+        error_file = job_name
+
+    if not job_class:
+        if nodes <= 16:
+            if wall_time.total_seconds() < 30 * 60:
+                job_class = 'test'
+            else:
+                job_class = 'micro'
+        elif nodes <= 792:
+            job_class = 'general'
+        elif nodes <= 3168:
+            job_class = 'big'
+        else:
+            job_class = 'special'
+
+    tasks_per_node = min(cores_per_node, cores)
+
+    omp_places = "cores" if hyperthreading == 1 else "threads"
+
+    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "supermucng.job")
+    additional_lines = ""
+    if account:
+        additional_lines += '#SBATCH --account=%s\n' % (account,)
+    if fixed_freq:
+        additional_lines += '#SBATCH --ear=off\n'
+
+    result = open(template_file).read().format(cores=cores,
+                                               nodes=nodes,
+                                               initial_dir=initial_dir,
+                                               tasks_per_node=tasks_per_node,
+                                               tasks_per_core=hyperthreading,
+                                               cpus_per_task=omp_num_threads,
+                                               partition=job_class,
+                                               job_name=job_name,
+                                               wall_time=wall_time,
+                                               omp_places=omp_places,
+                                               output_file=output_file,
+                                               additional_lines=additional_lines,
+                                               error_file=error_file)
+
+    exec_line = "mpiexec -n %d %s %s \n"
+
+    if exe_name is not None:
+        for param_file in parameter_files:
+            result += exec_line % (cores, exe_name, param_file)
+
+    for exe_paramfile_pair in commands:
+        if type(exe_paramfile_pair) is not tuple:
+            result += exe_paramfile_pair + "\n"
+        else:
+            result += exec_line % (cores, exe_paramfile_pair[0], exe_paramfile_pair[1])
+
+    return result
+
+
+if __name__ == '__main__':
+    from waLBerla.tools.jobscripts import createJobscript
+    print(createJobscript(wall_time=60*60, nodes=4, exe_name='grandchem', parameter_files=['a.cfg', 'b.cfg'],
+                          machine='supermuc_ng'))
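With the trailing newline on the --account line, the optional header lines stack correctly when both an account and a fixed frequency are requested; a sketch with a hypothetical account name:

    additional_lines = ""
    account = 'pr12ab'  # hypothetical SLURM account
    fixed_freq = True
    if account:
        additional_lines += '#SBATCH --account=%s\n' % (account,)
    if fixed_freq:
        additional_lines += '#SBATCH --ear=off\n'
    print(additional_lines, end='')
    # #SBATCH --account=pr12ab
    # #SBATCH --ear=off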