Source code for gxformat2.converter

"""Functionality for converting a Format 2 workflow into a standard Galaxy workflow."""

import argparse
import copy
import json
import logging
import os
import sys
import uuid
from typing import Any, Dict, Optional

from ._labels import Labels
from .model import (
    convert_dict_to_id_list_if_needed,
    ensure_step_position,
    inputs_as_native_steps,
    with_step_ids,
)
from .yaml import ordered_load

SCRIPT_DESCRIPTION = """
Convert a Format 2 Galaxy workflow description into a native format.
"""

# source: step#output and $link: step#output instead of outputSource: step/output and $link: step/output
SUPPORT_LEGACY_CONNECTIONS = os.environ.get("GXFORMAT2_SUPPORT_LEGACY_CONNECTIONS") == "1"
STEP_TYPES = [
    "subworkflow",
    "data_input",
    "data_collection_input",
    "tool",
    "pause",
    "parameter_input",
]

STEP_TYPE_ALIASES = {
    'input': 'data_input',
    'input_collection': 'data_collection_input',
    'parameter': 'parameter_input',
}

RUN_ACTIONS_TO_STEPS = {
    'GalaxyWorkflow': 'run_workflow_to_step',
    'GalaxyTool': 'run_tool_to_step',
}

POST_JOB_ACTIONS = {
    'hide': {
        'action_class': "HideDatasetAction",
        'default': False,
        'arguments': lambda x: x,
    },
    'rename': {
        'action_class': 'RenameDatasetAction',
        'default': {},
        'arguments': lambda x: {'newname': x},
    },
    'delete_intermediate_datasets': {
        'action_class': 'DeleteIntermediatesAction',
        'default': False,
        'arguments': lambda x: x,
    },
    'change_datatype': {
        'action_class': 'ChangeDatatypeAction',
        'default': {},
        'arguments': lambda x: {'newtype': x},
    },
    'set_columns': {
        'action_class': 'ColumnSetAction',
        'default': {},
        'arguments': lambda x: x,
    },
    'add_tags': {
        'action_class': 'TagDatasetAction',
        'default': [],
        'arguments': lambda x: {'tags': ",".join(x)},
    },
    'remove_tags': {
        'action_class': 'RemoveTagDatasetAction',
        'default': [],
        'arguments': lambda x: {'tags': ",".join(x)},
    },
}

log = logging.getLogger(__name__)


def rename_arg(argument):
    return argument


def clean_connection(value):
    if value and "#" in value and SUPPORT_LEGACY_CONNECTIONS:
        # Hope these are just used by Galaxy testing workflows and such, and not in production workflows.
        log.warn(f"Legacy workflow syntax for connections [{value}] will not be supported in the future")
        value = value.replace("#", "/", 1)
    else:
        return value


[docs]class ImportOptions:

    def __init__(self):
        self.deduplicate_subworkflows = False


[docs]def yaml_to_workflow(has_yaml, galaxy_interface, workflow_directory, import_options=None):
    """Convert a Format 2 workflow into standard Galaxy format from supplied stream."""
    as_python = ordered_load(has_yaml)
    return python_to_workflow(as_python, galaxy_interface, workflow_directory, import_options=import_options)


[docs]def python_to_workflow(as_python, galaxy_interface, workflow_directory=None, import_options=None):
    """Convert a Format 2 workflow into standard Galaxy format from supplied dictionary."""
    if "yaml_content" in as_python:
        as_python = ordered_load(as_python["yaml_content"])

    if workflow_directory is None:
        workflow_directory = os.path.abspath(".")

    conversion_context = ConversionContext(
        galaxy_interface,
        workflow_directory,
        import_options,
    )
    as_python = _preprocess_graphs(as_python, conversion_context)
    subworkflows = None
    if conversion_context.import_options.deduplicate_subworkflows:
        # TODO: import only required workflows...
        # TODO: dag sort these...
        subworkflows = {}
        for graph_id, subworkflow_content in conversion_context.graph_ids.items():
            if graph_id == "main":
                continue
            subworkflow_conversion_context = conversion_context.get_subworkflow_conversion_context_graph("#" + graph_id)
            subworkflows[graph_id] = _python_to_workflow(copy.deepcopy(subworkflow_content), subworkflow_conversion_context)
    converted = _python_to_workflow(as_python, conversion_context)
    if subworkflows is not None:
        converted["subworkflows"] = subworkflows
    return converted


# move to a utils file?
def steps_as_list(format2_workflow: dict, add_ids: bool = False, inputs_offset: int = 0, mutate: bool = False):
    """Return steps as a list, converting ID map to list representation if needed.

    This method does mutate the supplied steps, try to make progress toward not doing this.

    Add keys as labels instead of IDs. Why am I doing this?
    """
    if "steps" not in format2_workflow:
        raise Exception(f"No 'steps' key in dict, keys are {format2_workflow.keys()}")
    steps = format2_workflow["steps"]
    steps = convert_dict_to_id_list_if_needed(steps, add_label=True, mutate=mutate)
    if add_ids:
        if mutate:
            _append_step_id_to_step_list_elements(steps, inputs_offset=inputs_offset)
        else:
            steps = with_step_ids(steps, inputs_offset=inputs_offset)
    return steps


def _append_step_id_to_step_list_elements(steps: list, inputs_offset: int = 0):
    assert isinstance(steps, list)
    for i, step in enumerate(steps):
        if "id" not in step:
            step["id"] = i + inputs_offset
        assert step["id"] is not None


def _python_to_workflow(as_python, conversion_context):

    if "class" not in as_python:
        raise Exception("This is not a not a valid Galaxy workflow definition, must define a class.")

    if as_python["class"] != "GalaxyWorkflow":
        raise Exception("This is not a not a valid Galaxy workflow definition, 'class' must be 'GalaxyWorkflow'.")

    # .ga files don't have this, drop it so it isn't interpreted as a format 2 workflow.
    as_python.pop("class")

    _ensure_defaults(as_python, {
        "a_galaxy_workflow": "true",
        "format-version": "0.1",
        "name": as_python.pop("label", "Workflow"),
        "uuid": str(uuid.uuid4()),
    })
    _populate_annotation(as_python)

    steps = steps_as_list(as_python, mutate=True)

    convert_inputs_to_steps(as_python, steps)

    if isinstance(steps, list):
        _append_step_id_to_step_list_elements(steps)
        steps_as_dict: Dict[str, Any] = {}
        for i, step in enumerate(steps):
            steps_as_dict[str(i)] = step
            if "label" in step:
                label = step["label"]
                conversion_context.labels[label] = i

            # TODO: this really should be optional in Galaxy API.
            ensure_step_position(step, i)

        as_python["steps"] = steps_as_dict
        steps = steps_as_dict

    for step in steps.values():
        step_type = step.get("type", None)
        if "run" in step:
            if step_type is not None:
                raise Exception("Steps specified as run actions cannot specify a type.")
            run_action = step.get("run")
            run_action = conversion_context.get_runnable_description(run_action)
            if isinstance(run_action, dict):
                run_class = run_action["class"]
                run_to_step_function = eval(RUN_ACTIONS_TO_STEPS[run_class])

                run_to_step_function(conversion_context, step, run_action)
            else:
                step["content_id"] = run_action
                step["type"] = "subworkflow"
            del step["run"]

    for step in steps.values():
        step_type = step.get("type", "tool")
        step_type = STEP_TYPE_ALIASES.get(step_type, step_type)
        if step_type not in STEP_TYPES:
            raise Exception(f"Unknown step type encountered {step_type}")
        step["type"] = step_type
        eval(f"transform_{step_type}")(conversion_context, step)

    outputs = as_python.pop("outputs", [])
    outputs = convert_dict_to_id_list_if_needed(outputs)

    for output in outputs:
        assert isinstance(output, dict), "Output definition must be dictionary"
        assert "source" in output or "outputSource" in output, "Output definition must specify source"

        if "label" in output and "id" in output:
            raise Exception("label and id are aliases for outputs, may only define one")
        if "label" not in output and "id" not in output:
            label = ""

        raw_label = output.pop("label", None)
        raw_id = output.pop("id", None)
        label = raw_label or raw_id
        if Labels.is_anonymous_output_label(label):
            label = None
        source = clean_connection(output.get("outputSource"))
        if source is None and SUPPORT_LEGACY_CONNECTIONS:
            source = output.get("source").replace("#", "/", 1)
        id, output_name = conversion_context.step_output(source)
        step = steps[str(id)]
        workflow_output = {
            "output_name": output_name,
            "label": label,
            "uuid": output.get("uuid", None)
        }
        if "workflow_outputs" not in step:
            step["workflow_outputs"] = []
        step["workflow_outputs"].append(workflow_output)

    return as_python


def _preprocess_graphs(as_python, conversion_context):
    if not isinstance(as_python, dict):
        raise Exception("This is not a not a valid Galaxy workflow definition.")

    format_version = as_python.get("format-version", "v2.0")
    assert format_version == "v2.0"

    if "class" not in as_python and "$graph" in as_python:
        for subworkflow in as_python["$graph"]:
            if not isinstance(subworkflow, dict):
                raise Exception("Malformed workflow content in $graph")
            if "id" not in subworkflow:
                raise Exception("No subworkflow ID found for entry in $graph.")
            subworkflow_id = subworkflow["id"]
            if subworkflow_id == "main":
                as_python = subworkflow

            conversion_context.register_runnable(subworkflow)

    return as_python


def convert_inputs_to_steps(workflow_dict: dict, steps: list):
    """Convert workflow inputs to a steps in array - like in native Galaxy.

    workflow_dict is a Format 2 representation of a workflow and steps is a
    list of steps. This method will prepend all the inputs as as steps to the
    steps list. This method modifies both workflow_dict and steps.
    """
    if "inputs" not in workflow_dict:
        return

    input_steps = inputs_as_native_steps(workflow_dict)
    workflow_dict.pop("inputs")
    for i, new_step in enumerate(input_steps):
        steps.insert(i, new_step)


def run_workflow_to_step(conversion_context, step, run_action):
    step["type"] = "subworkflow"
    if conversion_context.import_options.deduplicate_subworkflows and _is_graph_id_reference(run_action):
        step["content_id"] = run_action
    else:
        subworkflow_conversion_context = conversion_context.get_subworkflow_conversion_context(step)
        step["subworkflow"] = _python_to_workflow(
            copy.deepcopy(run_action),
            subworkflow_conversion_context,
        )


def _is_graph_id_reference(run_action):
    return run_action and not isinstance(run_action, dict)


def transform_data_input(context, step):
    transform_input(context, step, default_name="Input dataset")


def transform_data_collection_input(context, step):
    transform_input(context, step, default_name="Input dataset collection")


def transform_parameter_input(context, step):
    transform_input(context, step, default_name="input_parameter")


def transform_input(context, step, default_name):
    default_name = step.get("label", default_name)
    _populate_annotation(step)
    _ensure_inputs_connections(step)

    if "inputs" not in step:
        step["inputs"] = [{}]

    step_inputs = step["inputs"][0]
    if "name" in step_inputs:
        name = step_inputs["name"]
    else:
        name = default_name

    _ensure_defaults(step_inputs, {
        "name": name,
        "description": "",
    })
    tool_state = {
        "name": name
    }
    for attrib in ["collection_type", "parameter_type", "optional", "default", "format", "restrictions", "restrictOnConnections", "suggestions"]:
        if attrib in step:
            tool_state[attrib] = step[attrib]

    _populate_tool_state(step, tool_state)


def transform_pause(context, step, default_name="Pause for dataset review"):
    default_name = step.get("label", default_name)
    _populate_annotation(step)

    _ensure_inputs_connections(step)

    if "inputs" not in step:
        step["inputs"] = [{}]

    step_inputs = step["inputs"][0]
    if "name" in step_inputs:
        name = step_inputs["name"]
    else:
        name = default_name

    _ensure_defaults(step_inputs, {
        "name": name,
    })
    tool_state = {
        "name": name
    }

    connect = _init_connect_dict(step)
    _populate_input_connections(context, step, connect)
    _populate_tool_state(step, tool_state)


def transform_subworkflow(context, step):
    if "when" in step and "source" in step["when"]:
        step_id, output_name = context.step_output(step["when"]["source"])
        step["when"]["source"] = {"id": step_id, "output_name": output_name}

    _populate_annotation(step)

    _ensure_inputs_connections(step)

    tool_state = {
    }

    connect = _init_connect_dict(step)
    _populate_input_connections(context, step, connect)
    _populate_tool_state(step, tool_state)


def _runtime_value():
    return {"__class__": "RuntimeValue"}


def transform_tool(context, step):
    if "tool_id" not in step:
        raise Exception("Tool steps must define a tool_id.")

    _ensure_defaults(step, {
        "name": step['tool_id'],
        "post_job_actions": {},
        "tool_version": None,
    })
    post_job_actions = step["post_job_actions"]
    _populate_annotation(step)

    tool_state = {
        # TODO: Galaxy should not require tool state actually specify a __page__.
        "__page__": 0,
    }

    connect = _init_connect_dict(step)

    def append_link(key, value):
        if key not in connect:
            connect[key] = []
        assert "$link" in value
        link_value = value["$link"]
        connect[key].append(clean_connection(link_value))

    def replace_links(value, key=""):
        if _is_link(value):
            append_link(key, value)
            # Filled in by the connection, so to force late
            # validation of the field just mark as RuntimeValue.
            # It would be better I guess if this were some other
            # value dedicated to this purpose (e.g. a ficitious
            # {"__class__": "ConnectedValue"}) that could be further
            # validated by Galaxy.
            return _runtime_value()
        if isinstance(value, dict):
            new_values = {}
            for k, v in value.items():
                new_key = _join_prefix(key, k)
                new_values[k] = replace_links(v, new_key)
            return new_values
        elif isinstance(value, list):
            new_values = []
            for i, v in enumerate(value):
                # If we are a repeat we need to modify the key
                # but not if values are actually $links.
                if _is_link(v):
                    append_link(key, v)
                    new_values.append(None)
                else:
                    new_key = "%s_%d" % (key, i)
                    new_values.append(replace_links(v, new_key))
            return new_values
        else:
            return value

    # TODO: handle runtime inputs and state together.
    runtime_inputs = step.get("runtime_inputs", [])
    if "state" in step or runtime_inputs:
        step_state = step.pop("state", {})
        step_state = replace_links(step_state)

        for key, value in step_state.items():
            tool_state[key] = json.dumps(value)
        for runtime_input in runtime_inputs:
            tool_state[runtime_input] = json.dumps(_runtime_value())
    elif "tool_state" in step:
        tool_state.update(step.get("tool_state"))

    # Fill in input connections
    _populate_input_connections(context, step, connect)

    _populate_tool_state(step, tool_state)

    # Handle outputs.
    out = step.pop("out", None)
    if out is None:
        # Handle LEGACY 19.XX outputs key.
        out = step.pop("outputs", [])
    out = convert_dict_to_id_list_if_needed(out)
    for output in out:
        name = output["id"]
        for action_key, action_dict in POST_JOB_ACTIONS.items():
            action_argument = output.get(action_key, action_dict['default'])
            if action_argument:
                action_class = action_dict['action_class']
                action_name = action_class + name
                action = _action(
                    action_class,
                    name,
                    arguments=action_dict['arguments'](action_argument)
                )
                post_job_actions[action_name] = action


def run_tool_to_step(conversion_context, step, run_action):
    tool_description = conversion_context.galaxy_interface.import_tool(
        run_action
    )
    step["type"] = "tool"
    step["tool_id"] = tool_description["tool_id"]
    step["tool_version"] = tool_description["tool_version"]
    step["tool_hash"] = tool_description.get("tool_hash")
    step["tool_uuid"] = tool_description.get("uuid")


class BaseConversionContext:

    def __init__(self):
        self.labels = {}
        self.subworkflow_conversion_contexts = {}

    def step_id(self, label_or_id):
        if label_or_id in self.labels:
            id_ = self.labels[label_or_id]
        else:
            id_ = label_or_id
        return int(id_)

    def step_output(self, value):
        value_parts = str(value).split("/")
        if len(value_parts) == 1:
            value_parts.append("output")
        id = self.step_id(value_parts[0])
        return id, value_parts[1]

    def get_subworkflow_conversion_context(self, step):
        # TODO: sometimes this method takes format2 steps and some times converted native ones
        # (for input connections) - redo this so the type signature is stronger.
        step_id = step.get("id")
        run_action = step.get("run")
        if self.import_options.deduplicate_subworkflows and _is_graph_id_reference(run_action):
            subworkflow_conversion_context = self.get_subworkflow_conversion_context_graph(run_action)
            return subworkflow_conversion_context
        if "content_id" in step:
            subworkflow_conversion_context = self.get_subworkflow_conversion_context_graph(step["content_id"])
            return subworkflow_conversion_context

        if step_id not in self.subworkflow_conversion_contexts:

            subworkflow_conversion_context = SubworkflowConversionContext(
                self
            )
            self.subworkflow_conversion_contexts[step_id] = subworkflow_conversion_context
        return self.subworkflow_conversion_contexts[step_id]

    def get_runnable_description(self, run_action):
        if "@import" in run_action:
            if len(run_action) > 1:
                raise Exception("@import must be only key if present.")

            run_action_path = run_action["@import"]
            runnable_path = os.path.join(self.workflow_directory, run_action_path)
            with open(runnable_path) as f:
                runnable_description = ordered_load(f)
                run_action = runnable_description

        if not self.import_options.deduplicate_subworkflows and _is_graph_id_reference(run_action):
            run_action = self.graph_ids[run_action[1:]]

        return run_action


class ConversionContext(BaseConversionContext):

    def __init__(self, galaxy_interface, workflow_directory, import_options: Optional[ImportOptions] = None):
        super().__init__()
        self.import_options = import_options or ImportOptions()
        self.graph_ids: Dict[str, Any] = {}
        self.graph_id_subworkflow_conversion_contexts: Dict[str, Any] = {}
        self.workflow_directory = workflow_directory
        self.galaxy_interface = galaxy_interface

    def register_runnable(self, run_action):
        assert "id" in run_action
        self.graph_ids[run_action["id"]] = run_action

    def get_subworkflow_conversion_context_graph(self, graph_id):
        if graph_id not in self.graph_id_subworkflow_conversion_contexts:
            subworkflow_conversion_context = SubworkflowConversionContext(
                self
            )
            self.graph_id_subworkflow_conversion_contexts[graph_id] = subworkflow_conversion_context
        return self.graph_id_subworkflow_conversion_contexts[graph_id]


class SubworkflowConversionContext(BaseConversionContext):

    def __init__(self, parent_context):
        super().__init__()
        self.parent_context = parent_context

    @property
    def graph_ids(self):
        return self.parent_context.graph_ids

    @property
    def workflow_directory(self):
        return self.parent_context.workflow_directory

    @property
    def import_options(self):
        return self.parent_context.import_options

    @property
    def galaxy_interface(self):
        return self.parent_context.galaxy_interface

    def get_subworkflow_conversion_context_graph(self, graph_id):
        return self.parent_context.get_subworkflow_conversion_context_graph(graph_id)


def _action(type, name, arguments):
    return {
        "action_arguments": arguments,
        "action_type": type,
        "output_name": name,
    }


def _is_link(value):
    return isinstance(value, dict) and "$link" in value


def _join_prefix(prefix, key):
    if prefix:
        new_key = f"{prefix}|{key}"
    else:
        new_key = key
    return new_key


def _init_connect_dict(step):
    if "connect" not in step:
        step["connect"] = {}

    connect = step["connect"]
    del step["connect"]

    # handle CWL-style in dict connections.
    if "in" in step:
        step_in = step["in"]
        assert isinstance(step_in, dict)
        connection_keys = set()
        for key, value in step_in.items():
            # TODO: this can be a list right?
            if isinstance(value, dict) and 'source' in value:
                value = value["source"]
            elif isinstance(value, dict) and 'default' in value:
                continue
            elif isinstance(value, dict):
                raise KeyError(f'step input must define either source or default {value}')
            connect[key] = [value]
            connection_keys.add(key)

        for key in connection_keys:
            del step_in[key]

        if len(step_in) == 0:
            del step['in']

    return connect


def _populate_input_connections(context, step, connect):
    _ensure_inputs_connections(step)
    input_connections = step["input_connections"]
    is_subworkflow_step = step.get("type") == "subworkflow"

    for key, values in connect.items():
        input_connection_value = []
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if not isinstance(value, dict):
                if key == "$step":
                    value += "/__NO_INPUT_OUTPUT_NAME__"
                if not isinstance(value, list):
                    value = [value]
                for source in value:
                    step_id, output_name = context.step_output(source)
                    source = {"id": step_id, "output_name": output_name}
                    if is_subworkflow_step:
                        subworkflow_conversion_context = context.get_subworkflow_conversion_context(step)
                        if key in subworkflow_conversion_context.labels:
                            input_subworkflow_step_id = subworkflow_conversion_context.step_id(key)
                            source["input_subworkflow_step_id"] = input_subworkflow_step_id
                    input_connection_value.append(source)
        if key == "$step":
            key = "__NO_INPUT_OUTPUT_NAME__"
        input_connections[key] = input_connection_value


def _populate_annotation(step):
    if "annotation" not in step and "doc" in step:
        annotation = step.pop("doc")
        step["annotation"] = annotation
    elif "annotation" not in step:
        step["annotation"] = ""


def _ensure_inputs_connections(step):
    if "input_connections" not in step:
        step["input_connections"] = {}


def _ensure_defaults(in_dict, defaults):
    for key, value in defaults.items():
        if key not in in_dict:
            in_dict[key] = value


def _populate_tool_state(step, tool_state):
    step["tool_state"] = json.dumps(tool_state)


[docs]def main(argv=None):
    """Entry point for script to conversion from Format 2 interface."""
    if argv is None:
        argv = sys.argv[1:]

    args = _parser().parse_args(argv)

    format2_path = args.input_path
    output_path = args.output_path or (format2_path + ".gxwf.yml")

    workflow_directory = os.path.abspath(format2_path)
    galaxy_interface = None

    with open(format2_path) as f:
        has_workflow = ordered_load(f)

    output = python_to_workflow(has_workflow, galaxy_interface=galaxy_interface, workflow_directory=workflow_directory)
    with open(output_path, "w") as f:
        json.dump(output, f, indent=4)


def _parser():
    parser = argparse.ArgumentParser(description=SCRIPT_DESCRIPTION)
    parser.add_argument('input_path', metavar='INPUT', type=str,
                        help='input workflow path (.ga)')
    parser.add_argument('output_path', metavar='OUTPUT', type=str, nargs="?",
                        help='output workflow path (.gxfw.yml)')
    return parser


if __name__ == "__main__":
    main(sys.argv)


__all__ = (
    'main',
    'python_to_workflow',
    'yaml_to_workflow',
)