"""Functionality for converting a Format 2 workflow into a standard Galaxy workflow."""
import argparse
import copy
import json
import logging
import os
import sys
import uuid
from typing import Any, Dict, Optional
from ._labels import Labels
from .model import (
convert_dict_to_id_list_if_needed,
ensure_step_position,
inputs_as_native_steps,
with_step_ids,
)
from .yaml import ordered_load
# Description text handed to argparse by _parser() for the command-line help.
SCRIPT_DESCRIPTION = """
Convert a Format 2 Galaxy workflow description into a native format.
"""

# Legacy connection syntax is ``source: step#output`` and ``$link: step#output``
# instead of ``outputSource: step/output`` and ``$link: step/output``.
# Support is opt-in: it is enabled only when the environment variable
# GXFORMAT2_SUPPORT_LEGACY_CONNECTIONS is exactly "1" at import time.
SUPPORT_LEGACY_CONNECTIONS = os.environ.get("GXFORMAT2_SUPPORT_LEGACY_CONNECTIONS") == "1"

# Step types accepted after alias resolution; _python_to_workflow rejects
# anything else and dispatches to the matching ``transform_<type>`` function.
STEP_TYPES = [
    "subworkflow",
    "data_input",
    "data_collection_input",
    "tool",
    "pause",
    "parameter_input",
]

# Short step type names mapped to their entry in STEP_TYPES.
STEP_TYPE_ALIASES = {
    'input': 'data_input',
    'input_collection': 'data_collection_input',
    'parameter': 'parameter_input',
}

# Maps the ``class`` of a step's ``run`` action to the NAME of the
# module-level function that turns it into a step.
RUN_ACTIONS_TO_STEPS = {
    'GalaxyWorkflow': 'run_workflow_to_step',
    'GalaxyTool': 'run_tool_to_step',
}

# Output-level keys that transform_tool turns into post job actions.
# For each key:
#   action_class - used as the action type and as the prefix of the action name
#   default      - value assumed when the output does not define the key
#                  (a falsy value means no action is emitted)
#   arguments    - callable converting the supplied value into action arguments
POST_JOB_ACTIONS = {
    'hide': {
        'action_class': "HideDatasetAction",
        'default': False,
        'arguments': lambda x: x,
    },
    'rename': {
        'action_class': 'RenameDatasetAction',
        'default': {},
        'arguments': lambda x: {'newname': x},
    },
    'delete_intermediate_datasets': {
        'action_class': 'DeleteIntermediatesAction',
        'default': False,
        'arguments': lambda x: x,
    },
    'change_datatype': {
        'action_class': 'ChangeDatatypeAction',
        'default': {},
        'arguments': lambda x: {'newtype': x},
    },
    'set_columns': {
        'action_class': 'ColumnSetAction',
        'default': {},
        'arguments': lambda x: x,
    },
    'add_tags': {
        'action_class': 'TagDatasetAction',
        'default': [],
        # Tags are supplied as a list and joined into one comma-separated string.
        'arguments': lambda x: {'tags': ",".join(x)},
    },
    'remove_tags': {
        'action_class': 'RemoveTagDatasetAction',
        'default': [],
        'arguments': lambda x: {'tags': ",".join(x)},
    },
}

# Module-level logger.
log = logging.getLogger(__name__)
def rename_arg(argument):
    """Return ``argument`` unchanged (identity mapping)."""
    unchanged = argument
    return unchanged
def clean_connection(value):
    """Normalize a connection reference to the ``step/output`` form.

    When legacy connection support is enabled (``SUPPORT_LEGACY_CONNECTIONS``)
    and ``value`` contains ``#``, the first ``#`` is replaced with ``/`` and a
    deprecation warning is logged. In every other case ``value`` is returned
    untouched (including ``None`` and the empty string).

    :param value: connection reference such as ``"step/output"``, or ``None``.
    :returns: the normalized reference.
    """
    if value and "#" in value and SUPPORT_LEGACY_CONNECTIONS:
        # Hope these are just used by Galaxy testing workflows and such, and not in production workflows.
        # Logger.warn() is a deprecated alias; use warning().
        log.warning("Legacy workflow syntax for connections [%s] will not be supported in the future", value)
        value = value.replace("#", "/", 1)
    # Always hand the reference back; previously the legacy branch fell off
    # the end of the function and callers received None.
    return value
class ImportOptions:
    """Switches that influence how a Format 2 workflow is converted."""

    def __init__(self):
        # When true, python_to_workflow converts every ``$graph`` entry other
        # than "main" once, returns them under the "subworkflows" key, and
        # steps refer to them through ``content_id`` instead of embedding a copy.
        self.deduplicate_subworkflows: bool = False
def yaml_to_workflow(has_yaml, galaxy_interface, workflow_directory, import_options=None):
    """Convert a Format 2 workflow into standard Galaxy format from supplied stream.

    ``has_yaml`` is parsed with ``ordered_load`` and the result is handed to
    :func:`python_to_workflow` together with the remaining arguments.
    """
    return python_to_workflow(
        ordered_load(has_yaml),
        galaxy_interface,
        workflow_directory,
        import_options=import_options,
    )
def python_to_workflow(as_python, galaxy_interface, workflow_directory=None, import_options=None):
    """Convert a Format 2 workflow into standard Galaxy format from supplied dictionary.

    :param as_python: the parsed Format 2 document, or a dictionary carrying
        the raw document under the ``yaml_content`` key.
    :param galaxy_interface: object stored on the conversion context and used
        for tool imports.
    :param workflow_directory: base directory for ``@import`` references;
        defaults to the current working directory.
    :param import_options: optional :class:`ImportOptions`.
    :returns: the converted workflow dictionary. When subworkflow
        deduplication is enabled it additionally carries a ``subworkflows``
        mapping of graph ID to converted subworkflow.
    """
    if "yaml_content" in as_python:
        as_python = ordered_load(as_python["yaml_content"])

    if workflow_directory is None:
        workflow_directory = os.path.abspath(".")

    context = ConversionContext(galaxy_interface, workflow_directory, import_options)
    as_python = _preprocess_graphs(as_python, context)

    if not context.import_options.deduplicate_subworkflows:
        return _python_to_workflow(as_python, context)

    # TODO: import only required workflows...
    # TODO: dag sort these...
    converted_subworkflows = {}
    for graph_id, content in context.graph_ids.items():
        if graph_id != "main":
            graph_context = context.get_subworkflow_conversion_context_graph("#" + graph_id)
            # Convert a copy so the registered graph entry itself stays untouched.
            converted_subworkflows[graph_id] = _python_to_workflow(copy.deepcopy(content), graph_context)

    converted = _python_to_workflow(as_python, context)
    converted["subworkflows"] = converted_subworkflows
    return converted
# move to a utils file?
def steps_as_list(format2_workflow: dict, add_ids: bool = False, inputs_offset: int = 0, mutate: bool = False):
    """Return the workflow's steps as a list.

    An ID-keyed mapping of steps is converted to its list representation
    (keys are added as labels) via ``convert_dict_to_id_list_if_needed``.

    :param format2_workflow: dictionary that must contain a ``steps`` key.
    :param add_ids: when true, every step lacking an ``id`` gets one.
    :param inputs_offset: value added to a step's position when assigning IDs.
    :param mutate: forwarded to the conversion helper; with ``add_ids`` it
        also selects in-place ID assignment instead of ``with_step_ids``.
    :raises Exception: if ``format2_workflow`` has no ``steps`` key.
    """
    if "steps" not in format2_workflow:
        raise Exception(f"No 'steps' key in dict, keys are {format2_workflow.keys()}")

    step_list = convert_dict_to_id_list_if_needed(format2_workflow["steps"], add_label=True, mutate=mutate)
    if not add_ids:
        return step_list
    if not mutate:
        return with_step_ids(step_list, inputs_offset=inputs_offset)
    _append_step_id_to_step_list_elements(step_list, inputs_offset=inputs_offset)
    return step_list
def _append_step_id_to_step_list_elements(steps: list, inputs_offset: int = 0):
assert isinstance(steps, list)
for i, step in enumerate(steps):
if "id" not in step:
step["id"] = i + inputs_offset
assert step["id"] is not None
def _python_to_workflow(as_python, conversion_context):
    """Convert one Format 2 workflow dictionary into native form, in place.

    The conversion proceeds in stages:

    1. validate and drop ``class``, fill in top-level defaults and annotation;
    2. turn steps (and workflow inputs) into an index-keyed ``steps`` mapping,
       recording step labels on ``conversion_context``;
    3. resolve ``run`` actions into tool or subworkflow steps;
    4. apply the per-type ``transform_<type>`` function to each step;
    5. attach workflow ``outputs`` to their source steps as ``workflow_outputs``.

    :param as_python: Format 2 workflow dictionary (mutated and returned).
    :param conversion_context: conversion context for this workflow.
    :returns: ``as_python`` converted to the native representation.
    :raises Exception: for a missing or wrong ``class``, a ``run`` step that
        also defines ``type``, an unknown step type, or an output defining
        both ``label`` and ``id``.
    """
    if "class" not in as_python:
        raise Exception("This is not a not a valid Galaxy workflow definition, must define a class.")
    if as_python["class"] != "GalaxyWorkflow":
        raise Exception("This is not a not a valid Galaxy workflow definition, 'class' must be 'GalaxyWorkflow'.")

    # .ga files don't have this, drop it so it isn't interpreted as a format 2 workflow.
    as_python.pop("class")
    _ensure_defaults(as_python, {
        "a_galaxy_workflow": "true",
        "format-version": "0.1",
        "name": as_python.pop("label", "Workflow"),
        "uuid": str(uuid.uuid4()),
    })
    _populate_annotation(as_python)

    steps = steps_as_list(as_python, mutate=True)
    convert_inputs_to_steps(as_python, steps)

    if isinstance(steps, list):
        _append_step_id_to_step_list_elements(steps)
        steps_as_dict: Dict[str, Any] = {}
        for i, step in enumerate(steps):
            steps_as_dict[str(i)] = step
            if "label" in step:
                conversion_context.labels[step["label"]] = i
            # TODO: this really should be optional in Galaxy API.
            ensure_step_position(step, i)
        as_python["steps"] = steps_as_dict
        steps = steps_as_dict

    # Resolve ``run`` actions into concrete step types.
    for step in steps.values():
        step_type = step.get("type", None)
        if "run" in step:
            if step_type is not None:
                raise Exception("Steps specified as run actions cannot specify a type.")
            run_action = step.get("run")
            run_action = conversion_context.get_runnable_description(run_action)
            if isinstance(run_action, dict):
                run_class = run_action["class"]
                # Look the handler up by name in the module namespace rather
                # than eval()-ing the name as an expression.
                run_to_step_function = globals()[RUN_ACTIONS_TO_STEPS[run_class]]
                run_to_step_function(conversion_context, step, run_action)
            else:
                # A non-dict run action is kept as a reference to the subworkflow.
                step["content_id"] = run_action
                step["type"] = "subworkflow"
            del step["run"]

    # Apply the type-specific transformation to every step.
    for step in steps.values():
        step_type = step.get("type", "tool")
        step_type = STEP_TYPE_ALIASES.get(step_type, step_type)
        if step_type not in STEP_TYPES:
            raise Exception(f"Unknown step type encountered {step_type}")
        step["type"] = step_type
        # step_type was validated against STEP_TYPES above, so the name is
        # one of the transform_* functions defined in this module.
        globals()[f"transform_{step_type}"](conversion_context, step)

    outputs = as_python.pop("outputs", [])
    outputs = convert_dict_to_id_list_if_needed(outputs)
    for output in outputs:
        assert isinstance(output, dict), "Output definition must be dictionary"
        assert "source" in output or "outputSource" in output, "Output definition must specify source"
        if "label" in output and "id" in output:
            raise Exception("label and id are aliases for outputs, may only define one")

        raw_label = output.pop("label", None)
        raw_id = output.pop("id", None)
        label = raw_label or raw_id
        if Labels.is_anonymous_output_label(label):
            label = None

        raw_source = output.get("outputSource")
        source = clean_connection(raw_source)
        if source is None and SUPPORT_LEGACY_CONNECTIONS:
            # Legacy documents may use ``source`` instead of ``outputSource``;
            # fall back to the raw ``outputSource`` so a missing ``source``
            # key does not end in a ``None.replace`` failure.
            legacy_source = output.get("source")
            if legacy_source is None:
                legacy_source = raw_source
            source = legacy_source.replace("#", "/", 1)

        source_step_id, output_name = conversion_context.step_output(source)
        step = steps[str(source_step_id)]
        workflow_output = {
            "output_name": output_name,
            "label": label,
            "uuid": output.get("uuid", None)
        }
        step.setdefault("workflow_outputs", []).append(workflow_output)

    return as_python
def _preprocess_graphs(as_python, conversion_context):
if not isinstance(as_python, dict):
raise Exception("This is not a not a valid Galaxy workflow definition.")
format_version = as_python.get("format-version", "v2.0")
assert format_version == "v2.0"
if "class" not in as_python and "$graph" in as_python:
for subworkflow in as_python["$graph"]:
if not isinstance(subworkflow, dict):
raise Exception("Malformed workflow content in $graph")
if "id" not in subworkflow:
raise Exception("No subworkflow ID found for entry in $graph.")
subworkflow_id = subworkflow["id"]
if subworkflow_id == "main":
as_python = subworkflow
conversion_context.register_runnable(subworkflow)
return as_python
def convert_inputs_to_steps(workflow_dict: dict, steps: list):
    """Prepend the workflow's inputs to ``steps`` as native-style steps.

    ``workflow_dict`` is a Format 2 workflow and ``steps`` its list of steps.
    Both are modified: the ``inputs`` key is removed from ``workflow_dict``
    and the converted inputs are placed, in order, at the front of ``steps``.
    Nothing happens when the workflow defines no ``inputs``.
    """
    if "inputs" not in workflow_dict:
        return

    converted_inputs = inputs_as_native_steps(workflow_dict)
    workflow_dict.pop("inputs")
    # Splice the input steps in ahead of the existing steps.
    steps[:0] = converted_inputs
def run_workflow_to_step(conversion_context, step, run_action):
    """Turn a step that runs a ``GalaxyWorkflow`` into a subworkflow step.

    With subworkflow deduplication enabled and ``run_action`` being a graph ID
    reference, the step only records the reference as ``content_id``.
    Otherwise a deep copy of ``run_action`` is converted and embedded under
    the step's ``subworkflow`` key.
    """
    step["type"] = "subworkflow"
    by_reference = (
        conversion_context.import_options.deduplicate_subworkflows
        and _is_graph_id_reference(run_action)
    )
    if by_reference:
        step["content_id"] = run_action
        return

    nested_context = conversion_context.get_subworkflow_conversion_context(step)
    step["subworkflow"] = _python_to_workflow(copy.deepcopy(run_action), nested_context)
def _is_graph_id_reference(run_action):
return run_action and not isinstance(run_action, dict)
def transform_data_input(context, step):
    """Transform a ``data_input`` step, naming it "Input dataset" by default."""
    fallback_name = "Input dataset"
    transform_input(context, step, default_name=fallback_name)
def transform_data_collection_input(context, step):
    """Transform a ``data_collection_input`` step, naming it "Input dataset collection" by default."""
    fallback_name = "Input dataset collection"
    transform_input(context, step, default_name=fallback_name)
def transform_parameter_input(context, step):
    """Transform a ``parameter_input`` step, naming it "input_parameter" by default."""
    fallback_name = "input_parameter"
    transform_input(context, step, default_name=fallback_name)
def transform_input(context, step, default_name):
    """Fill in the native fields of an input step.

    The step's ``label`` (when present) overrides ``default_name``. The first
    entry of the step's ``inputs`` list receives ``name`` and ``description``
    defaults, and a JSON ``tool_state`` is built from the name plus any of
    the recognised input attributes defined on the step.

    :param context: conversion context (not used by this transform).
    :param step: step dictionary, modified in place.
    :param default_name: name used when neither the step's inputs nor its
        label provide one.
    """
    default_name = step.get("label", default_name)
    _populate_annotation(step)
    _ensure_inputs_connections(step)

    step_inputs = step.setdefault("inputs", [{}])[0]
    name = step_inputs.get("name", default_name)
    _ensure_defaults(step_inputs, {
        "name": name,
        "description": "",
    })

    copied_attributes = (
        "collection_type",
        "parameter_type",
        "optional",
        "default",
        "format",
        "restrictions",
        "restrictOnConnections",
        "suggestions",
    )
    tool_state = {"name": name}
    tool_state.update({attrib: step[attrib] for attrib in copied_attributes if attrib in step})
    _populate_tool_state(step, tool_state)
def transform_pause(context, step, default_name="Pause for dataset review"):
    """Fill in the native fields of a pause step.

    The step's ``label`` (when present) overrides ``default_name``. The first
    entry of the step's ``inputs`` list receives a ``name`` default, the
    step's connections are resolved into ``input_connections`` and the
    ``tool_state`` is set to the JSON form of ``{"name": name}``.
    """
    default_name = step.get("label", default_name)
    _populate_annotation(step)
    _ensure_inputs_connections(step)

    step_inputs = step.setdefault("inputs", [{}])[0]
    name = step_inputs.get("name", default_name)
    _ensure_defaults(step_inputs, {"name": name})

    connect = _init_connect_dict(step)
    _populate_input_connections(context, step, connect)
    _populate_tool_state(step, {"name": name})
def transform_subworkflow(context, step):
    """Fill in the native fields of a subworkflow step.

    A ``when`` clause with a ``source`` has that source resolved to a
    ``{"id", "output_name"}`` dictionary. The step's connections are resolved
    into ``input_connections`` and the ``tool_state`` is set to an empty JSON
    object.
    """
    if "when" in step:
        when_clause = step["when"]
        if "source" in when_clause:
            source_step_id, source_output = context.step_output(when_clause["source"])
            when_clause["source"] = {"id": source_step_id, "output_name": source_output}

    _populate_annotation(step)
    _ensure_inputs_connections(step)

    connect = _init_connect_dict(step)
    _populate_input_connections(context, step, connect)
    _populate_tool_state(step, {})
def _runtime_value():
return {"__class__": "RuntimeValue"}
def transform_tool(context, step):
    """Fill in the native fields of a tool step, in place.

    Builds the step's JSON ``tool_state`` (from Format 2 ``state`` and
    ``runtime_inputs``, or from an existing ``tool_state``), resolves its
    connections into ``input_connections`` and converts the per-output
    settings found under ``out`` (or legacy ``outputs``) into
    ``post_job_actions``.

    :param context: conversion context used to resolve connection sources.
    :param step: step dictionary; must define ``tool_id``.
    :raises Exception: if the step does not define ``tool_id``.
    """
    if "tool_id" not in step:
        raise Exception("Tool steps must define a tool_id.")
    _ensure_defaults(step, {
        "name": step['tool_id'],
        "post_job_actions": {},
        "tool_version": None,
    })
    post_job_actions = step["post_job_actions"]
    _populate_annotation(step)
    tool_state = {
        # TODO: Galaxy should not require tool state actually specify a __page__.
        "__page__": 0,
    }
    # Connections declared via ``connect`` / ``in``; extended below with any
    # ``$link`` entries found inside the step's state.
    connect = _init_connect_dict(step)

    def append_link(key, value):
        # Record the target of a {"$link": ...} value as a connection for ``key``.
        if key not in connect:
            connect[key] = []
        assert "$link" in value
        link_value = value["$link"]
        connect[key].append(clean_connection(link_value))

    def replace_links(value, key=""):
        # Recursively copy ``value`` while moving every $link into ``connect``.
        # ``key`` is the parameter path accumulated so far.
        if _is_link(value):
            append_link(key, value)
            # Filled in by the connection, so to force late
            # validation of the field just mark as RuntimeValue.
            # It would be better I guess if this were some other
            # value dedicated to this purpose (e.g. a fictitious
            # {"__class__": "ConnectedValue"}) that could be further
            # validated by Galaxy.
            return _runtime_value()
        if isinstance(value, dict):
            new_values = {}
            for k, v in value.items():
                new_key = _join_prefix(key, k)
                new_values[k] = replace_links(v, new_key)
            return new_values
        elif isinstance(value, list):
            new_values = []
            for i, v in enumerate(value):
                # If we are a repeat we need to modify the key
                # but not if values are actually $links.
                if _is_link(v):
                    append_link(key, v)
                    new_values.append(None)
                else:
                    new_key = "%s_%d" % (key, i)
                    new_values.append(replace_links(v, new_key))
            return new_values
        else:
            return value

    # TODO: handle runtime inputs and state together.
    runtime_inputs = step.get("runtime_inputs", [])
    if "state" in step or runtime_inputs:
        step_state = step.pop("state", {})
        step_state = replace_links(step_state)
        # Each top-level state value is stored as its own JSON string.
        for key, value in step_state.items():
            tool_state[key] = json.dumps(value)
        for runtime_input in runtime_inputs:
            tool_state[runtime_input] = json.dumps(_runtime_value())
    elif "tool_state" in step:
        tool_state.update(step.get("tool_state"))

    # Fill in input connections
    _populate_input_connections(context, step, connect)
    _populate_tool_state(step, tool_state)

    # Handle outputs.
    out = step.pop("out", None)
    if out is None:
        # Handle LEGACY 19.XX outputs key.
        out = step.pop("outputs", [])
    out = convert_dict_to_id_list_if_needed(out)
    for output in out:
        name = output["id"]
        for action_key, action_dict in POST_JOB_ACTIONS.items():
            action_argument = output.get(action_key, action_dict['default'])
            # A falsy argument (including the table's default) means no action.
            if action_argument:
                action_class = action_dict['action_class']
                # Actions are keyed by action class + output name.
                action_name = action_class + name
                action = _action(
                    action_class,
                    name,
                    arguments=action_dict['arguments'](action_argument)
                )
                post_job_actions[action_name] = action
def run_tool_to_step(conversion_context, step, run_action):
    """Turn a step that runs a ``GalaxyTool`` into a tool step.

    The tool definition is imported through the context's Galaxy interface;
    the returned description supplies ``tool_id`` and ``tool_version``
    (required) plus ``tool_hash`` and ``uuid`` (optional, ``None`` if absent).
    """
    galaxy = conversion_context.galaxy_interface
    imported = galaxy.import_tool(run_action)

    step["type"] = "tool"
    step["tool_id"] = imported["tool_id"]
    step["tool_version"] = imported["tool_version"]
    step["tool_hash"] = imported.get("tool_hash")
    step["tool_uuid"] = imported.get("uuid")
class BaseConversionContext:
    """State shared by the conversion of a single workflow.

    Subclasses supply ``import_options``, ``workflow_directory``,
    ``graph_ids`` and ``get_subworkflow_conversion_context_graph``.
    """

    def __init__(self):
        # Step label -> step index.
        self.labels = {}
        # Step ID -> conversion context of the subworkflow embedded in that step.
        self.subworkflow_conversion_contexts = {}

    def step_id(self, label_or_id):
        """Resolve a step label to its index; anything else is treated as the ID itself."""
        return int(self.labels.get(label_or_id, label_or_id))

    def step_output(self, value):
        """Split a ``step/output`` reference into ``(step_id, output_name)``.

        A reference without ``/`` uses ``"output"`` as the output name.
        """
        parts = str(value).split("/")
        output_name = parts[1] if len(parts) > 1 else "output"
        return self.step_id(parts[0]), output_name

    def get_subworkflow_conversion_context(self, step):
        """Return the conversion context for the subworkflow run by ``step``."""
        # TODO: sometimes this method takes format2 steps and some times converted native ones
        # (for input connections) - redo this so the type signature is stronger.
        run_action = step.get("run")
        if self.import_options.deduplicate_subworkflows and _is_graph_id_reference(run_action):
            return self.get_subworkflow_conversion_context_graph(run_action)
        if "content_id" in step:
            return self.get_subworkflow_conversion_context_graph(step["content_id"])

        # Embedded subworkflow: one context per step ID, created on first use.
        step_id = step.get("id")
        per_step_contexts = self.subworkflow_conversion_contexts
        if step_id not in per_step_contexts:
            per_step_contexts[step_id] = SubworkflowConversionContext(self)
        return per_step_contexts[step_id]

    def get_runnable_description(self, run_action):
        """Resolve a ``run`` action to the thing that should actually be run.

        An ``@import`` action is replaced by the content of the referenced
        file (relative to the workflow directory). Without subworkflow
        deduplication, a graph ID reference is replaced by the registered
        graph entry.

        :raises Exception: if ``@import`` is combined with other keys.
        """
        if "@import" in run_action:
            if len(run_action) > 1:
                raise Exception("@import must be only key if present.")
            runnable_path = os.path.join(self.workflow_directory, run_action["@import"])
            with open(runnable_path) as f:
                run_action = ordered_load(f)

        if not self.import_options.deduplicate_subworkflows and _is_graph_id_reference(run_action):
            # Strip the leading character of the reference to get the graph ID.
            run_action = self.graph_ids[run_action[1:]]
        return run_action
class ConversionContext(BaseConversionContext):
    """Top-level conversion context owning the settings shared by all subworkflows."""

    def __init__(self, galaxy_interface, workflow_directory, import_options: Optional[ImportOptions] = None):
        super().__init__()
        self.import_options = import_options or ImportOptions()
        # Graph ID -> registered ``$graph`` entry.
        self.graph_ids: Dict[str, Any] = {}
        # Graph reference -> conversion context for that subworkflow.
        self.graph_id_subworkflow_conversion_contexts: Dict[str, Any] = {}
        self.workflow_directory = workflow_directory
        self.galaxy_interface = galaxy_interface

    def register_runnable(self, run_action):
        """Remember ``run_action`` under its ``id`` so steps can reference it."""
        assert "id" in run_action
        self.graph_ids[run_action["id"]] = run_action

    def get_subworkflow_conversion_context_graph(self, graph_id):
        """Return the conversion context for ``graph_id``, creating it on first use."""
        known_contexts = self.graph_id_subworkflow_conversion_contexts
        if graph_id in known_contexts:
            return known_contexts[graph_id]
        created = SubworkflowConversionContext(self)
        known_contexts[graph_id] = created
        return created
class SubworkflowConversionContext(BaseConversionContext):
    """Conversion context of a subworkflow.

    Labels are tracked per subworkflow, while shared settings and graph
    bookkeeping are delegated to the parent context.
    """

    def __init__(self, parent_context):
        super().__init__()
        self.parent_context = parent_context

    @property
    def graph_ids(self):
        """Registered graph entries of the parent context."""
        parent = self.parent_context
        return parent.graph_ids

    @property
    def workflow_directory(self):
        """Workflow directory of the parent context."""
        parent = self.parent_context
        return parent.workflow_directory

    @property
    def import_options(self):
        """Import options of the parent context."""
        parent = self.parent_context
        return parent.import_options

    @property
    def galaxy_interface(self):
        """Galaxy interface of the parent context."""
        parent = self.parent_context
        return parent.galaxy_interface

    def get_subworkflow_conversion_context_graph(self, graph_id):
        """Delegate the graph context lookup to the parent context."""
        parent = self.parent_context
        return parent.get_subworkflow_conversion_context_graph(graph_id)
def _action(type, name, arguments):
return {
"action_arguments": arguments,
"action_type": type,
"output_name": name,
}
def _is_link(value):
return isinstance(value, dict) and "$link" in value
def _join_prefix(prefix, key):
if prefix:
new_key = f"{prefix}|{key}"
else:
new_key = key
return new_key
def _init_connect_dict(step):
if "connect" not in step:
step["connect"] = {}
connect = step["connect"]
del step["connect"]
# handle CWL-style in dict connections.
if "in" in step:
step_in = step["in"]
assert isinstance(step_in, dict)
connection_keys = set()
for key, value in step_in.items():
# TODO: this can be a list right?
if isinstance(value, dict) and 'source' in value:
value = value["source"]
elif isinstance(value, dict) and 'default' in value:
continue
elif isinstance(value, dict):
raise KeyError(f'step input must define either source or default {value}')
connect[key] = [value]
connection_keys.add(key)
for key in connection_keys:
del step_in[key]
if len(step_in) == 0:
del step['in']
return connect
def _populate_input_connections(context, step, connect):
    """Translate ``connect`` into the step's native ``input_connections``.

    For every key, each source reference is resolved through
    ``context.step_output`` into an ``{"id", "output_name"}`` dictionary.
    For subworkflow steps the entry additionally gets
    ``input_subworkflow_step_id`` when the key is a label known to the
    subworkflow's conversion context.

    :param context: conversion context used to resolve step references.
    :param step: step dictionary; its ``input_connections`` is updated in place.
    :param connect: mapping of input name to a source or a list of sources.
    """
    _ensure_inputs_connections(step)
    input_connections = step["input_connections"]
    is_subworkflow_step = step.get("type") == "subworkflow"
    for key, values in connect.items():
        input_connection_value = []
        # Accept a single source as well as a list of sources.
        if not isinstance(values, list):
            values = [values]
        for value in values:
            # NOTE(review): values that are already dictionaries are not
            # handled by this branch - confirm that is intended.
            if not isinstance(value, dict):
                if key == "$step":
                    # "$step" connections carry no output name, so a
                    # placeholder output name is appended to the reference.
                    value += "/__NO_INPUT_OUTPUT_NAME__"
                if not isinstance(value, list):
                    value = [value]
                for source in value:
                    step_id, output_name = context.step_output(source)
                    source = {"id": step_id, "output_name": output_name}
                    if is_subworkflow_step:
                        subworkflow_conversion_context = context.get_subworkflow_conversion_context(step)
                        if key in subworkflow_conversion_context.labels:
                            input_subworkflow_step_id = subworkflow_conversion_context.step_id(key)
                            source["input_subworkflow_step_id"] = input_subworkflow_step_id
                    input_connection_value.append(source)
        # "$step" connections are stored under the placeholder input name.
        if key == "$step":
            key = "__NO_INPUT_OUTPUT_NAME__"
        input_connections[key] = input_connection_value
def _populate_annotation(step):
if "annotation" not in step and "doc" in step:
annotation = step.pop("doc")
step["annotation"] = annotation
elif "annotation" not in step:
step["annotation"] = ""
def _ensure_inputs_connections(step):
if "input_connections" not in step:
step["input_connections"] = {}
def _ensure_defaults(in_dict, defaults):
for key, value in defaults.items():
if key not in in_dict:
in_dict[key] = value
def _populate_tool_state(step, tool_state):
step["tool_state"] = json.dumps(tool_state)
def main(argv=None):
    """Entry point for script to conversion from Format 2 interface.

    Reads the Format 2 workflow at the INPUT path, converts it and writes the
    result as JSON to the OUTPUT path (INPUT + ".gxwf.yml" when omitted).

    :param argv: command-line arguments without the program name; defaults
        to ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = _parser().parse_args(argv)

    format2_path = args.input_path
    output_path = args.output_path or (format2_path + ".gxwf.yml")

    # Relative ``@import`` references are joined onto workflow_directory, so
    # it must be the directory containing the workflow, not the file itself.
    workflow_directory = os.path.dirname(os.path.abspath(format2_path))
    galaxy_interface = None

    with open(format2_path) as f:
        has_workflow = ordered_load(f)

    output = python_to_workflow(has_workflow, galaxy_interface=galaxy_interface, workflow_directory=workflow_directory)
    with open(output_path, "w") as f:
        json.dump(output, f, indent=4)
def _parser():
parser = argparse.ArgumentParser(description=SCRIPT_DESCRIPTION)
parser.add_argument('input_path', metavar='INPUT', type=str,
help='input workflow path (.ga)')
parser.add_argument('output_path', metavar='OUTPUT', type=str, nargs="?",
help='output workflow path (.gxfw.yml)')
return parser
if __name__ == "__main__":
    # main() falls back to sys.argv[1:] on its own. Passing sys.argv here
    # would make the parser treat the program name as the INPUT argument.
    main()

# Public API of this module.
__all__ = (
    'main',
    'python_to_workflow',
    'yaml_to_workflow',
)