Source code for run

import sys
from datetime import datetime
import glob
import json
from logging import getLogger
import os
import random
import stat
import string
import tempfile

import yaml

import abstract_step as abst
import command as command_info
import exec_group
import pipeline_info
import misc

logger = getLogger("uap_logger")

class Run(object):
    '''
    The Run class is a helper class which represents a run in a step.
    Declare runs inside AbstractStep.runs() via::

        with self.new_run(run_id) as run:
            # declare output files, private and public info here

    After that, use the available methods to configure the run.
    The run typically has no information about input connections, only
    about input files.
    '''
    def __init__(self, step, run_id):
        if '/' in run_id:
            logger.error("Error: A run ID must not contain a slash: %s."
                         % run_id)
            sys.exit(1)
        self._step = step
        '''
        Step this run belongs to.
        '''
        self._run_id = run_id
        '''
        Identifier of this run.
        '''
        self._private_info = dict()
        self._public_info = dict()
        self._input_files = list()
        self._output_files = dict()
        for out_connection in self._step.get_out_connections():
            self.add_out_connection(out_connection)
        '''
        Dictionary containing the output files for each outgoing connection
        and their corresponding input files::

           annotation_1:
               out_path_1: [in_path_1, in_path_2, ...]
               out_path_2: ...
           annotation_2: ...
        '''
        self._ping_files = {
            'run': None,
            'queued': None
        }
        self._submit_script = None
        self._exec_groups = list()
        self._temp_paths = list()
        '''
        List of temporary paths which can be either files or directories.
        '''
        self._temp_directory = None
        '''
        Contains the path to the currently used temporary directory, if set.
        '''
        self._known_paths = dict()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        pass

    def new_exec_group(self):
        eg = exec_group.ExecGroup(self)
        self._exec_groups.append(eg)
        return eg

    def get_exec_groups(self):
        return self._exec_groups

    def get_step(self):
        return self._step

    def set_run_id(self, run_id):
        self._run_id = run_id

    def get_run_id(self):
        return self._run_id

    def get_out_connections(self):
        return self._output_files.keys()

    def get_out_connection(self, connection):
        if not connection.startswith("out/"):
            connection = 'out/' + connection
        if connection in self.get_out_connections():
            return connection
        else:
            logger.error("Connection %s not declared for step %s"
                         % (connection, self.get_step()))
            sys.exit(1)

    @property
    def annotation_file(self):
        afl = glob.glob(
            os.path.join(
                self.get_output_directory(),
                ".%s-annotation-*.yaml" % self.get_run_id()))
        if not afl:
            print(self.get_output_directory())
            return ""
        elif len(afl) != 1:
            raise StandardError("Found multiple annotation files: %s"
                                % ", ".join(afl))
        elif os.path.isfile(afl[0]):
            return afl[0]

    def _get_ping_file(self, key):
        if self._ping_files[key] == None:
            self._ping_files[key] = os.path.join(
                self.get_output_directory(),
                '.%s-%s-ping.yaml' % (self.get_run_id(), key)
            )
        return self._ping_files[key]

    def get_executing_ping_file(self):
        return self._get_ping_file('run')

    def get_queued_ping_file(self):
        return self._get_ping_file('queued')

    def get_submit_script_file(self):
        if self._submit_script == None:
            self._submit_script = os.path.join(
                self.get_output_directory(),
                ".submit-%s-%s.sh" % (self.get_step().get_step_name(),
                                      self.get_run_id())
            )
        return self._submit_script

    def is_source(self):
        return isinstance(self._step, abst.AbstractSourceStep)

    def replace_output_dir_du_jour(func):
        def inner(self, *args, **kwargs):
            # Collect the information needed to replace the du-jour
            # placeholder with the current output directory
            step = self.get_step()
            placeholder = self.get_output_directory_du_jour_placeholder()
            temp_out_dir = self.get_output_directory_du_jour()
            # if not temp_out_dir:
            #     placeholder += os.sep
            value = None
            ret_value = func(self, *args, **kwargs)
            # If we are currently calling AbstractStep.runs(), do nothing
            if temp_out_dir == None:
                value = ret_value
            elif isinstance(ret_value, list):
                value = list()
                for string in ret_value:
                    if string != None and placeholder in string:
                        value.append(
                            string.replace(placeholder, temp_out_dir))
                    else:
                        value.append(string)
            elif isinstance(ret_value, str):
                if placeholder in ret_value:
                    value = ret_value.replace(placeholder, temp_out_dir)
                else:
                    value = ret_value
            elif isinstance(ret_value, dict):
                for key in ret_value.keys():
                    if key != None and placeholder in key:
                        new_key = key.replace(placeholder, temp_out_dir)
                        ret_value[new_key] = ret_value.pop(key)
                value = ret_value
            elif ret_value == None:
                value = None
            else:
                logger.error("Function %s does not return a list, str, dict, "
                             "or None object" % func.__name__)
                sys.exit(1)
            return value
        return inner

    @replace_output_dir_du_jour
    def get_known_paths(self):
        return self._known_paths

    def add_known_paths(self, known_paths_dict):
        self._known_paths.update(known_paths_dict)

    @replace_output_dir_du_jour
    def get_temp_paths(self):
        '''
        Returns a list of all temporary paths which belong to this run.
        '''
        return self._temp_paths
    def get_output_directory_du_jour_placeholder(self):
        '''
        Returns a placeholder for the temporary output directory, which
        needs to be replaced by the actual temporary directory inside the
        abstract_step.execute() method.
        '''
        return "<%s-output-directory-du-jour>" % \
            str(self.get_step().__class__.__name__)
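    # Illustrative sketch (not part of the original source): for a step class
    # named 'CutAdapt' (a hypothetical name), the placeholder returned above
    # would be the literal string '<CutAdapt-output-directory-du-jour>'.
    # A declared path such as
    #     '<CutAdapt-output-directory-du-jour>/sample_1.fastq.gz'
    # is later rewritten, e.g. by @replace_output_dir_du_jour or inside
    # abstract_step.execute(), to point at either the temporary or the final
    # output directory, depending on the step state.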
    def get_output_directory_du_jour(self):
        '''
        Returns the state-dependent output directory of the step.

        Returns this step's output directory according to its current state:

        - if we are currently calling a step's declare_runs() method, this
          will return None
        - if we are currently calling a step's execute() method, this will
          return the temporary directory
        - otherwise, it will return the real output directory
        '''
        if self.get_step()._state == abst.AbstractStep.states.DEFAULT:
            return self.get_output_directory()
        elif self.get_step()._state == abst.AbstractStep.states.EXECUTING:
            return self.get_temp_output_directory()
        else:
            return None
    def get_temp_output_directory(self):
        '''
        Returns the temporary output directory of a run.
        '''
        if self._temp_directory == None:
            while True:
                current_time = datetime.now().strftime('%y%m%d-%H%M%S-%f')
                path = os.path.join(
                    self.get_step().get_pipeline().config['destination_path'],
                    'temp',
                    'temp-%s-%s-%s' % (self.get_step().get_step_name(),
                                       self.get_run_id(), current_time))
                if not os.path.exists(path):
                    self._temp_directory = path
                    return self._temp_directory
        return self._temp_directory
    def get_execution_hashtag(self):
        '''
        Creates a hashtag based on the commands to be executed.

        This causes runs to be marked for rerunning if the commands to be
        executed change.
        '''
        # Store the current step state
        previous_state = self.get_step()._state
        # Set the step state to DECLARING to avoid circular dependencies
        self.get_step()._state = abst.AbstractStep.states.DECLARING

        cmd_by_eg = dict()
        eg_count = 0
        for exec_group in self.get_exec_groups():
            eg_count += 1
            cmd_by_eg[eg_count] = dict()
            pipe_count, cmd_count = (0, 0)
            for poc in exec_group.get_pipes_and_commands():
                # for each pipe or command (poc), check if it is a
                # pipeline ...
                if isinstance(poc, pipeline_info.PipelineInfo):
                    pipe_count += 1
                    cmd_by_eg[eg_count]['Pipe %s' % pipe_count] = list()
                    for command in poc.get_commands():
                        cmd_by_eg[eg_count]['Pipe %s' % pipe_count].append(
                            command.get_command())
                # ... or a command
                elif isinstance(poc, command_info.CommandInfo):
                    cmd_count += 1
                    cmd_by_eg[eg_count]['Cmd %s' % cmd_count] = \
                        poc.get_command()
        # Set the step state back to its original state
        self.get_step()._state = previous_state
        return misc.str_to_sha1_b62(json.dumps(cmd_by_eg))[0:8]
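    # Illustrative sketch (not part of the original source): the dictionary
    # hashed above is keyed by exec group number and 'Cmd n'/'Pipe n' labels,
    # roughly like
    #     {1: {'Cmd 1': ['cutadapt', '-a', 'AGATCGG', 'in.fastq']},
    #      2: {'Pipe 1': [['cat', 'in.txt'], ['gzip']]}}
    # (the command lines are made up). The run directory suffix is the first
    # eight characters of misc.str_to_sha1_b62(json.dumps(cmd_by_eg)), so any
    # change to the declared commands changes the hashtag and thereby marks
    # the run for rerunning.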
    def get_output_directory(self):
        '''
        Returns the final output directory.
        '''
        return os.path.join(
            self.get_step().get_pipeline().config['destination_path'],
            self.get_step().get_step_name(),
            '%s-%s' % (self.get_run_id(), self.get_execution_hashtag())
        )
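    # Illustrative sketch (not part of the original source): assuming a
    # configured destination_path of '/data/results', a step named 'cutadapt'
    # and a run ID 'sample_1', the final output directory would look like
    #     /data/results/cutadapt/sample_1-a1B2c3D4
    # where the last eight characters are the execution hashtag; all values
    # here are hypothetical.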
    def get_basic_state(self):
        '''
        Determines the basic run state of a run.

        The basic run state of a run is, at any time, one of **waiting**,
        **ready**, or **finished**. These states are determined from the
        current configuration and the timestamps of result files present
        in the file system. In addition to these three basic states, there
        are two additional states which are less reliable
        (see *get_run_state()*).
        '''
        def volatile_path_good(volatile_path, recurse = True):
            '''
            This function receives a volatile path and tries to load the
            placeholder YAML data structure. It then checks all downstream
            paths, which may in turn be volatile placeholder files.
            '''
            # reconstruct original path from volatile placeholder path
            path = volatile_path[:-len(abst.AbstractStep.VOLATILE_SUFFIX)]

            if abst.AbstractStep.fsc.exists(path):
                # the original file still exists, ignore volatile placeholder
                return False

            if not path in self.get_step().get_pipeline()\
                               .task_id_for_output_file:
                # there is no task which creates the output file
                return False

            task_id = self.get_step().get_pipeline()\
                          .task_id_for_output_file[path]
            task = self.get_step().get_pipeline().task_for_task_id[task_id]
            if not task.step._options['_volatile']:
                # the task is not declared volatile
                return False

            if not abst.AbstractStep.fsc.exists(volatile_path):
                # the volatile placeholder does not exist
                return False

            if not recurse:
                return True

            try:
                # try to parse the YAML contents
                info = abst.AbstractStep.fsc\
                           .load_yaml_from_file(volatile_path)
            except yaml.scanner.ScannerError:
                # error scanning YAML
                return False

            # now check whether all downstream files are in place and
            # up-to-date; also check whether all downstream files as defined
            # in file_dependencies_reverse are covered
            uncovered_files = set()
            if path in self.get_step().get_pipeline()\
                           .file_dependencies_reverse:
                uncovered_files = self.get_step().get_pipeline()\
                                      .file_dependencies_reverse[path]

            for downstream_path, downstream_info in info['downstream'].items():
                if downstream_path in self.get_step().get_pipeline()\
                                          .task_id_for_output_file:
                    # only check this downstream file if there's a task which
                    # creates it, otherwise it may be a file which is no
                    # longer used
                    pv_downstream_path = change_to_volatile_if_need_be(
                        downstream_path, recurse = False)
                    if not abst.AbstractStep.fsc.exists(pv_downstream_path):
                        return False
                    if not abst.AbstractStep.fsc\
                               .getmtime(pv_downstream_path) >= \
                       info['self']['mtime']:
                        return False
                    if downstream_path in uncovered_files:
                        uncovered_files.remove(downstream_path)

            if len(uncovered_files) > 0:
                # there are still files defined which are not covered by the
                # placeholder
                return False

            return True

        def change_to_volatile_if_need_be(path, recurse = True):
            '''
            Changes the file path to the volatile path if necessary.
            '''
            if path != None:
                if not abst.AbstractStep.fsc.exists(path):
                    # the real output file does not exist
                    volatile_path = path + abst.AbstractStep.VOLATILE_SUFFIX
                    if volatile_path_good(volatile_path, recurse):
                        return volatile_path
            return path

        def is_path_up_to_date(outpath, inpaths):
            '''
            First, replace paths with volatile paths if the step is marked
            as volatile and the real path is missing. But, only consider
            volatile placeholders if all child tasks are finished.
            That means: if a child of a volatile step needs to be run because
            it has been added or an existing step has been modified, the
            volatile placeholders are ignored, thus turning the task from
            'finished' to 'ready' or 'waiting'.

            Hint: The pv_ prefix is for 'possibly volatile'.
            '''
            pv_outpath = outpath
            pv_inpaths = list()

            if outpath in self.get_step().get_pipeline()\
                              .task_id_for_output_file:
                pv_outpath = change_to_volatile_if_need_be(outpath)

            for inpath in inpaths:
                pv_inpaths.append(change_to_volatile_if_need_be(inpath))

            if not abst.AbstractStep.fsc.exists(pv_outpath):
                return False
            for pv_inpath in pv_inpaths:
                if not abst.AbstractStep.fsc.exists(pv_inpath):
                    return False
                if abst.AbstractStep.fsc.getmtime(pv_inpath) > \
                   abst.AbstractStep.fsc.getmtime(pv_outpath):
                    return False
            return True

        def up_to_dateness_level(path, level = 0):
            result = level
            if path != None:
                dep_paths = self.get_step().get_pipeline()\
                                .file_dependencies[path]
                if not is_path_up_to_date(path, dep_paths):
                    result = level + 1
                for dep_path in dep_paths:
                    recursive_result = up_to_dateness_level(dep_path,
                                                            level + 1)
                    if recursive_result > level + 1:
                        result = max(result, recursive_result)
            return result

        '''
        - finished: all output files exist AND are up to date (recursively)
        - ready: NOT all output files exist AND all input files exist AND are
          up to date (recursively)
        - waiting: otherwise
        - if it's ready, it might be executing or queued -> check execute and
          queue ping
        - if it's waiting, it might be queued -> check queue ping

        The ping works like this (this example is for execute, the same goes
        for queued):

        - there's a ping file for every task (= step + run)
        - it contains information about when, how, and where the job was
          started
        - its timestamp gets renewed every 30 seconds (touch)
        - as soon as the job has finished, the execute ping file is removed;
          this should also work if the job crashes (however, it cannot work
          if the controlling script receives SIGKILL)
        - if its timestamp is no more than 5 minutes old, the task is
          regarded as currently executing
        - otherwise, a warning is printed because the ping file is probably
          stale (no automatic cleanup is performed, manual intervention is
          necessary)
        - warning: this requires all involved systems or the file system to
          be time-synchronized
        '''
        max_level = 0
        for tag, output_files in self.get_output_files_abspath().items():
            # output_files can be None if the connection is empty
            for output_file, input_files in output_files.items():
                if output_file != None and input_files != None:
                    max_level = max(
                        max_level, up_to_dateness_level(output_file))

        if max_level == 0:
            return self.get_step().get_pipeline().states.FINISHED
        elif max_level == 1:
            return self.get_step().get_pipeline().states.READY
        else:
            return self.get_step().get_pipeline().states.WAITING
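    # Summary sketch (not part of the original source) of how the maximum
    # up-to-dateness level computed above maps to the returned state:
    #     level 0  -> FINISHED (every file in the dependency chain is current)
    #     level 1  -> READY    (only this run's own outputs are missing/stale)
    #     level 2+ -> WAITING  (some upstream input is missing or stale)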
    def add_private_info(self, key, value):
        '''
        Add private information to a run.

        Use this to store data which you will need when the run is executed.
        As opposed to public information, private information is not visible
        to subsequent steps. You can store paths to input files here, but
        not paths to output files, as their expected location is not defined
        until we're in *AbstractStep.execute* (hint: they get written to a
        temporary directory inside *execute()*).
        '''
        if key in self._private_info and value != self._private_info[key]:
            logger.error(
                "You're trying to overwrite private info %s with %s, "
                "but there's already a different value stored: %s."
                % (key, value, self._private_info[key]))
            sys.exit(1)
        self._private_info[key] = value
    def add_public_info(self, key, value):
        '''
        Add public information to a run.

        For example, a FASTQ reader may store the index barcode here for
        subsequent steps to query via
        ``AbstractStep.find_upstream_info()``.
        '''
        if key in self._public_info and value != self._public_info[key]:
            logger.error(
                "You're trying to overwrite public info %s with %s, "
                "but there's already a different value stored: %s."
                % (key, value, self._public_info[key]))
            sys.exit(1)
        self._public_info[key] = value
    def update_public_info(self, key, value):
        '''
        Update public information already existing in a run.

        For example, all steps which handle FASTQ files want to know how to
        distinguish between files of read 1 and files of read 2. So each
        step that provides FASTQ files should update this information if
        the file names are altered. The stored information can be acquired
        via ``AbstractStep.find_upstream_info()``.
        '''
        if not key in self._public_info:
            logger.error("The key %s doesn't exist yet as public info. "
                         "Please use add_public_info(%s, %s)."
                         % (key, key, value))
            sys.exit(1)
        else:
            self._public_info[key] = value
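    # Illustrative sketch (not part of the original source): a hypothetical
    # FASTQ source step might record which file belongs to which read, and a
    # downstream step that renames files would then update that mapping:
    #     run.add_public_info('read_number', {'sample_1_R1.fastq': 'R1'})
    #     ...
    #     run.update_public_info('read_number',
    #                            {'sample_1_R1.trimmed.fastq': 'R1'})
    # The key 'read_number' and the file names are made up for illustration.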
    def add_output_file(self, tag, out_path, in_paths):
        '''
        Add an output file to this run. Output file names must be unique
        across all runs defined by a step, so it may be a good idea to
        include the run_id in the output filename.

        - *tag*: You must specify the connection annotation which must have
          been previously declared via
          *AbstractStep.add_connection("out/...")*, but this doesn't have to
          be done in the step constructor; it's also possible in
          *declare_runs()* right before this method is called.
        - *out_path*: The output file path, without a directory. The
          pipeline assigns directories for you (this parameter must not
          contain a slash).
        - *in_paths*: A list of input files this output file depends on. It
          is **crucial** to get this right, so that the pipeline can
          determine which steps are up-to-date at any given time. You have
          to specify absolute paths here, including a directory, and you
          can obtain them via
          *AbstractStep.run_ids_and_input_files_for_connection* and related
          functions.
        '''
        head, tail = os.path.split(out_path)
        # make sure there's no slash in out_path unless it's a source step
        if head != "" and not \
           isinstance(self._step, abst.AbstractSourceStep):
            logger.error("The declared output file path contains a "
                         "directory separator: %s." % out_path)
            sys.exit(1)
        # make sure tag was declared with an outgoing connection
        if 'out/' + tag not in self._step._connections:
            logger.error("Invalid output_file tag '%s' in %s. "
                         "You might want to add "
                         "self.add_connection('out/%s') to the constructor "
                         "of %s."
                         % (tag, str(self._step), tag, self._step.__module__))
            sys.exit(1)
        out_connection = self.get_out_connection(tag)

        if out_path in self.get_output_files_for_out_connection(
                out_connection):
            logger.error(
                "You're trying to re-add an output file which has already "
                "been declared: %s." % out_path)
            sys.exit(1)

        if not isinstance(in_paths, list):
            logger.error("Input paths (%s) is not a list." % in_paths)
            sys.exit(1)

        if None in in_paths:
            logger.error(
                "There is a NoneType element in input paths (%s) for output "
                "file (%s)" % (in_paths, out_path))
            sys.exit(1)

        if out_path == None:
            logger.error(
                "Trying to add NoneType element as output file for input "
                "paths: %s" % in_paths)
            sys.exit(1)

        self._input_files.append(in_paths)
        self._output_files[out_connection][out_path] = in_paths

        return_value = os.path.join(
            self.get_output_directory_du_jour_placeholder(), out_path)
        if head != "":
            return_value = os.path.abspath(out_path)
        return return_value
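    # Illustrative sketch (not part of the original source): inside a step's
    # runs()/declare_runs() method, an output file is typically declared
    # together with the absolute input paths it depends on, e.g.
    #     with self.new_run('sample_1') as run:
    #         run.add_output_file(
    #             'alignments',                       # needs 'out/alignments'
    #             'sample_1.bam',                     # file name only, no dirs
    #             ['/data/input/sample_1.fastq.gz'])  # absolute input paths
    # The connection name, run ID and paths are hypothetical; the returned
    # path still contains the output-directory-du-jour placeholder.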
    @replace_output_dir_du_jour
    def add_temporary_file(self, prefix = '', suffix = '',
                           designation = None):
        '''
        Returns the name of a temporary file. The name is derived from the
        given prefix and suffix plus a short hash, concatenated with the
        output directory placeholder. The concatenated string is returned
        and stored in a list. The placeholder is immediately properly
        adjusted by @replace_output_dir_du_jour.
        '''
        count = len(self._temp_paths)
        temp_placeholder = str()

        while True:
            hashtag = misc.str_to_sha1_b62('%s.%s.%s' % (prefix, count,
                                                         suffix))
            temp_name = prefix + hashtag + suffix
            temp_placeholder = os.path.join(
                self.get_output_directory_du_jour_placeholder(), temp_name)
            if not temp_placeholder in self._temp_paths:
                break
            else:
                count += 1

        logger.info("Temporary file (#%s): %s"
                    % (len(self._temp_paths) + 1, temp_name))

        # the _known_paths dict is logged
        known_paths = dict()
        known_paths[temp_placeholder] = {
            'label': os.path.basename(temp_placeholder),
            'designation': designation,
            'type': ''
        }
        self.add_known_paths(known_paths)
        # the _temp_paths list contains all temporary files which are going
        # to be deleted
        self._temp_paths.append(temp_placeholder)
        return temp_placeholder
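    # Illustrative sketch (not part of the original source): a step that
    # needs a scratch file for an intermediate result might request
    #     temp_file = run.add_temporary_file(prefix = 'sort-',
    #                                        suffix = '.tmp')
    # and pass temp_file to a command. The returned path still contains the
    # output-directory-du-jour placeholder and is removed again by
    # remove_temporary_paths(); the prefix and suffix values are made up.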
    def add_temporary_directory(self, prefix = '', suffix = '',
                                designation = None):
        '''
        Convenience method for the creation of temporary directories.
        Basically, it just calls self.add_temporary_file(). The magic
        happens in ProcessPool.__exit__().
        '''
        return self.add_temporary_file(prefix = prefix, suffix = suffix,
                                       designation = designation)
    def remove_temporary_paths(self):
        '''
        Everything stored in self._temp_paths is examined and deleted if
        possible. The list elements are removed in LIFO order.
        Also, the 'type' info in self._known_paths is updated here.
        NOTE: Additional stat checks are included to detect FIFOs as well
        as other special files.
        '''
        for temp_path in self.get_temp_paths()[::-1]:
            # Check the file type
            pathmode = os.stat(temp_path).st_mode
            isdir = stat.S_ISDIR(pathmode) != 0
            ischaracter = stat.S_ISCHR(pathmode) != 0
            isblock = stat.S_ISBLK(pathmode) != 0
            isfile = stat.S_ISREG(pathmode) != 0
            isfifo = stat.S_ISFIFO(pathmode) != 0
            islink = stat.S_ISLNK(pathmode) != 0
            issock = stat.S_ISSOCK(pathmode) != 0
            # Update the 'type' value
            if temp_path in self.get_known_paths().keys():
                if isfile:
                    logger.debug("Set %s 'type' info to 'file'" % temp_path)
                    self.get_known_paths()[temp_path]['type'] = 'file'
                elif isdir:
                    logger.debug("Set %s 'type' info to 'directory'"
                                 % temp_path)
                    self.get_known_paths()[temp_path]['type'] = 'directory'
                elif isfifo:
                    logger.debug("Set %s 'type' info to 'fifo'" % temp_path)
                    self.get_known_paths()[temp_path]['type'] = 'fifo'
            if os.path.isdir(temp_path) and isdir:
                try:
                    logger.info("Now deleting directory: %s" % temp_path)
                    os.rmdir(temp_path)
                except OSError as e:
                    logger.error("errno: %s" % e.errno)
                    logger.error("strerror: %s" % e.strerror)
                    logger.error("filename: %s" % e.filename)
                    pass
            else:
                try:
                    logger.info("Now deleting: %s" % temp_path)
                    os.unlink(temp_path)
                except OSError as e:
                    pass
    def add_empty_output_connection(self, tag):
        '''
        An empty output connection has 'None' as output file and 'None'
        as input file.
        '''
        # make sure tag was declared with an outgoing connection
        if 'out/' + tag not in self._step._connections:
            logger.error("Invalid output_file tag '%s' in %s. "
                         "You might want to add "
                         "self.add_connection('out/%s') to the constructor "
                         "of %s."
                         % (tag, str(self._step), tag, self._step.__module__))
            sys.exit(1)
        try:
            out_connection = self.get_out_connection(tag)
        except KeyError:
            out_connection = self.add_out_connection(tag)

        if None in self._output_files[out_connection]:
            logger.error(
                "You're trying to re-declare %s as an empty output "
                "connection." % out_connection)
            sys.exit(1)

        self._output_files[out_connection][None] = None
    def add_out_connection(self, out_connection):
        if not out_connection.startswith("out/"):
            out_connection = 'out/' + out_connection
        self._output_files[out_connection] = dict()
        return out_connection

    def get_input_files_for_output_file(self, output_file):
        for connection in self.get_out_connections():
            if output_file in \
               self.get_output_files_for_out_connection(connection):
                return self._output_files[connection][output_file]

    def get_input_files_for_output_file_abspath(self, output_file):
        for connection in self.get_out_connections():
            if output_file in \
               self.get_output_files_abspath_for_out_connection(connection):
                return self.get_output_files_abspath()[connection]\
                    [output_file]

    def get_output_files_for_out_connection(self, out_connection):
        return list(self._output_files[out_connection].keys())

    def get_output_files_abspath_for_out_connection(self, out_connection):
        return sorted(
            list(self.get_output_files_abspath()[out_connection].keys()))

    def get_output_files(self):
        return self._output_files
    def get_output_files_abspath(self):
        '''
        Return a dictionary of all defined output files, grouped by
        connection annotation::

           annotation_1:
               out_path_1: [in_path_1, in_path_2, ...]
               out_path_2: ...
           annotation_2: ...

        The ``out_path`` consists of the output directory du jour and the
        output file name.
        '''
        result = dict()
        for connection in self._output_files.keys():
            result[connection] = dict()
            for out_path, in_paths in self._output_files[connection].items():
                directory = self.get_output_directory_du_jour()
                full_path = out_path
                try:
                    head, tail = os.path.split(out_path)
                    if directory != None and out_path != None and head == "":
                        full_path = os.path.join(directory, out_path)
                except AttributeError:
                    pass
                result[connection][full_path] = in_paths

        return result
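    # Illustrative sketch (not part of the original source): the returned
    # mapping could look like
    #     {'out/alignments': {
    #         '/data/results/bwa/sample_1-a1B2c3D4/sample_1.bam':
    #             ['/data/input/sample_1.fastq.gz']}}
    # where the directory part depends on the current step state (temporary
    # directory while executing, final directory otherwise); the connection
    # and paths shown are hypothetical.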
    def get_single_output_file_for_annotation(self, annotation):
        '''
        Retrieve exactly one output file of the given annotation, and crash
        if there isn't exactly one.
        '''
        temp = self.get_output_files_abspath()
        if len(temp[annotation]) != 1:
            logger.error("Expected exactly one output file declared for "
                         "out/%s, but found %d."
                         % (annotation, len(temp[annotation])))
            sys.exit(1)
        return temp[annotation].keys()[0]
    def get_output_files_for_annotation_and_tags(self, annotation, tags):
        '''
        Retrieve a set of output files of the given annotation, assigned to
        the same number of specified tags. If you have two 'alignment'
        output files and they are called *out-a.txt* and *out-b.txt*, you
        can use this function like this:

        - *tags*: ['a', 'b']
        - result: {'a': 'out-a.txt', 'b': 'out-b.txt'}
        '''
        temp = self.get_output_files_abspath()
        return misc.assign_strings(temp[annotation].keys(), tags)
    def get_input_files_for_output_file(self, out_path):
        '''
        Return all input files a given output file depends on.
        '''
        temp = self.get_output_files()
        for tag in temp.keys():
            if out_path in temp[tag].keys():
                return sorted(temp[tag][out_path])
        logger.error("Sorry, your output file '%s' couldn't be found in "
                     "the dictionary: %s." % (out_path, temp))
        sys.exit(1)
    def get_public_info(self, key):
        '''
        Query public information which must have been previously stored via
        *add_public_info()*.
        '''
        return self._public_info[key]
    def has_public_info(self, key):
        '''
        Query whether a piece of public information has been defined.
        '''
        return (key in self._public_info)
    def get_private_info(self, key):
        '''
        Query private information which must have been previously stored
        via *add_private_info()*.
        '''
        return self._private_info[key]
    def has_private_info(self, key):
        '''
        Query whether a piece of private information has been defined.
        '''
        return (key in self._private_info)
    def as_dict(self):
        result = dict()
        result['output_directory'] = self.get_output_directory()
        result['output_files'] = self._output_files
        result['private_info'] = self._private_info
        result['public_info'] = self._public_info
        result['run_id'] = self._run_id
        return result
    def write_annotation_file(self, path):
        '''
        Write the YAML annotation after a successful or failed run. The
        annotation can later be used to render the process graph.
        '''
        # now write the annotation
        log = {}
        log['pid'] = os.getpid()
        log['step'] = {}
        log['step']['options'] = self.get_step().get_options()
        log['step']['name'] = self.get_step().get_step_name()
        log['step']['cores'] = self.get_step()._cores
        log['run'] = {}
        log['run']['run_info'] = self.as_dict()
        log['run']['run_id'] = self.get_run_id()
        log['run']['temp_directory'] = self.get_temp_output_directory()
        # if a submit script was used ...
        if os.path.exists(self.get_submit_script_file()):
            # ... read it and store it ...
            with open(self.get_submit_script_file(), 'r') as f:
                log['run']['submit_script'] = f.read()
            # ... finally delete it
            os.unlink(self.get_submit_script_file())
        log['run']['known_paths'] = self.get_known_paths()
        log['config'] = self.get_step().get_pipeline().config
        log['git_hash_tag'] = self.get_step().get_pipeline().git_hash_tag
        log['tool_versions'] = {}
        for tool in self.get_step()._tools.keys():
            log['tool_versions'][tool] = self.get_step().get_pipeline()\
                                             .tool_versions[tool]
        log['pipeline_log'] = self.get_step()._pipeline_log
        log['start_time'] = self.get_step().start_time
        log['end_time'] = self.get_step().end_time
        if self.get_step().get_pipeline().git_dirty_diff:
            log['git_dirty_diff'] = \
                self.get_step().get_pipeline().git_dirty_diff
        if self.get_step().get_pipeline().caught_signal is not None:
            log['signal'] = self.get_step().get_pipeline().caught_signal

        annotation_yaml = yaml.dump(log, default_flow_style = False)
        annotation_path = os.path.join(
            path,
            ".%s-annotation-%s.yaml"
            % (self.get_run_id(),
               misc.str_to_sha1_b62(annotation_yaml)[:6]))

        # overwrite the annotation if it already exists
        with open(annotation_path, 'w') as f:
            f.write(annotation_yaml)

        return annotation_path, annotation_yaml
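    # Illustrative sketch (not part of the original source): the annotation
    # is written as a hidden file named
    #     .<run_id>-annotation-<first 6 characters of the content hash>.yaml
    # and its top-level YAML keys mirror the 'log' dictionary built above:
    # 'pid', 'step', 'run', 'config', 'git_hash_tag', 'tool_versions',
    # 'pipeline_log', 'start_time', 'end_time' and, if applicable,
    # 'git_dirty_diff' and 'signal'.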