Source code for pta.model_assessment

"""Methods for the thermodynamic assessment of a metabolic network.
"""

import logging
from typing import List, Optional, Union

import cobra
import numpy as np
import pandas as pd
from cobra.flux_analysis.variability import flux_variability_analysis

from .commons import Q
from .constants import (
    default_flux_bound,
    default_max_non_intracellular_conc_mM,
    default_theta_s,
    default_theta_z,
    non_intracellular_compartment_ids,
)
from .pmo import PmoProblem
from .utils import (
    find_blocked_reactions_from_fva_solution,
    get_internal_cycles,
    tighten_model_bounds,
)

logger = logging.getLogger(__name__)


[docs]class StructuralAssessment:
    """This class is used to find thermodynamic inconsistencies in the definition of the
    network that make the model infeasible. Inconsistencies arise when flux constraints
    (steady state and irreversibilities) prevent any thermodynamically feasible non-zero
    solution. These inconsistencies occur with any assignment of Gibbs free energies.

    Parameters
    ----------
    model : cobra.Model
        The target model.
    biomass_id : Optional[str], optional
        ID of the biomass reactions. If None, the method will try to find it
        automatically.
    atpm_id : str, optional
        ID of the ATP maintenance reaction.
    """

    def __init__(
        self,
        model: cobra.Model,
        biomass_id: Optional[str] = None,
        atpm_id: str = "ATPM",
    ):
        # Enumerate all the internal cycles of the network and select the ones that must
        # be active in any non-zero solution.
        cycles = get_internal_cycles(model, biomass_id, atpm_id)
        forced_cycles = self._get_forced_internal_cycles(model, cycles)

        # Convert the EFMs matrix to a list of cycles, represented by reaction IDs.
        self._forced_internal_cycles = []
        for cycle in forced_cycles.T:
            cycle_reaction_ids = [
                model.reactions[i].id for i, v in enumerate(cycle) if v != 0
            ]
            self._forced_internal_cycles.append(cycle_reaction_ids)

    @property
[docs]    def forced_internal_cycles(self) -> List[List[str]]:
        """Gets a list of all the forced internal cycles of the network, i.e. the
        internal cycles that must be active in any non-zero flux solution and
        thus make the model thermodynamically inconsistent.
        """
        return self._forced_internal_cycles

[docs]    def summary(self):
        """Print a summary of the structural assessment."""
        print("Strutural thermodynamic assessment summary:")
        print("---------------------------------------------")
        print("")

        _ = pd.option_context(
            "display.max_rows",
            None,
            "display.max_columns",
            None,
            "display.precision",
            3,
        )

        if len(self.forced_internal_cycles) > 0:
            print(
                "> The following internal cycles are thermodynamically "
                "unfeasible, but must be active in any non-zero flux "
                "solution, meaning that the model is thermodynamically "
                "inconsistent. For the cycles below, please:\n (1) Verify and "
                "correct the direction of the irreversible reactions.\n (2) "
                "Remove reactions in cycles that are connected to the rest "
                "of the network through a single reaction (e.g. A -> B, "
                "B -> A, where only A participates in other reactions).\n "
                "Alternatively, you can use the StructuralAssessment."
                "autoresolve() function to automatically curate the model."
            )
            for i, cycle in enumerate(self.forced_internal_cycles):
                print(f'{i}. {", ".join(cycle)}')
        else:
            print("> No thermodynamic inconsistencies detected in the model.")
        print("")

[docs]    def autoresolve(
        self, model: cobra.Model, default_bound: float = default_flux_bound
    ):
        """Attempt to automatically resolve thermodynamic inconsistencies. This method
        removes all reactions involved in dead-end cycles (internal cycles that
        exchange flux with the rest of the network through a single metabolite) and
        makes all the reactions in the remaining cycles reversible. While this method
        should always resolve all inconsistencies, it is still recommended to inspect
        the inconsistencies and the curated model manually to verify that the model
        behave as expected. The output of the curation can be seen by enabling logging
        level INFO.

        Parameters
        ----------
        model : cobra.Model
            The model in which the curation actions must be applied. This can be the
            model on which the assessment was run or a similar model. This is useful for
            cases where assessment is run on a condition specific model but actions
            should be applied to a base model.
        default_bound : float, optional
            Default bound to use for unconstrained reactions.
        """
        for cycle in self.forced_internal_cycles:
            if self._is_semi_disconnected_cycle(model, cycle):
                logger.info(
                    "Removing all reactions of the dead-end cycle:"
                    f' {", ".join(cycle)}.'
                )
                model.remove_reactions(cycle, remove_orphans=True)
            else:
                logger.info(
                    "Removing bounds for all reactions in the cycle: "
                    f'{", ".join(cycle)}.'
                )
                for rxn_id in cycle:
                    try:
                        reaction = model.reactions.get_by_id(rxn_id)
                    except Exception:
                        continue
                    reaction.lower_bound = -default_bound
                    reaction.upper_bound = default_bound

    def _is_semi_disconnected_cycle(self, model: cobra.Model, cycle: np.array) -> bool:
        """Check whether a cycle is dead-end, i.e. it exchanges flux with the
        rest of the network through a single metabolite only.
        """
        with model as model:
            # Pick a reaction and fix it to a flux value that is not at the
            # boundaries.
            try:
                fixed_reaction = model.reactions.get_by_id(cycle[0])
            except Exception:
                # If the reaction does not exist, the cycle is broken and we don't need
                # to do anything else.
                return False
            model.objective = {fixed_reaction: 1}
            max_flux = model.slim_optimize()
            model.objective = {fixed_reaction: -1}
            min_flux = -model.slim_optimize()
            fixed_flux = (max_flux + min_flux) / 2
            fixed_reaction.lower_bound = fixed_flux
            fixed_reaction.upper_bound = fixed_flux

            # If all other reactions of the cycle are not fixed as well, then this is a
            # dead-end cycle.
            for rxn_id in cycle:
                try:
                    other_reaction = model.reactions.get_by_id(rxn_id)
                except Exception:
                    return False
                model.objective = {other_reaction: 1}
                max_flux = model.slim_optimize()
                model.objective = {other_reaction: -1}
                min_flux = -model.slim_optimize()
                if max_flux - min_flux > 1e-8:
                    return False

        return True

    def _get_forced_internal_cycles(
        self, model: cobra.Model, cycles: np.array
    ) -> np.array:
        """Select our of a list of cycles the ones that are forced."""
        forced_cycle_idxs = []
        for e in range(cycles.shape[1]):
            if self._is_forced_cycle(model, cycles[:, e]):
                forced_cycle_idxs.append(e)

        return cycles[:, forced_cycle_idxs]

    def _is_forced_cycle(
        self,
        model: cobra.Model,
        cycle: np.array,
    ) -> bool:
        """Check if a cycle is forced (must be active in any non-zero flux solution)."""
        min_flux = 1e-6
        rxn_idxs = np.where(cycle != 0)[0].tolist()

        # Check for any pair of reaction and for any sense of the cycle.
        for i, _ in enumerate(rxn_idxs):
            for j in range(i):
                for sense in [-1, 1]:
                    with model as model:
                        rxn_i = model.reactions[rxn_idxs[i]]
                        rxn_j = model.reactions[rxn_idxs[j]]
                        direction_i = cycle[rxn_idxs[i]] * sense
                        direction_j = cycle[rxn_idxs[j]] * sense

                        # If the chosen directions are not within the flux bounds of the
                        # model we can skip the iteration.
                        if (
                            (direction_i > 0 and rxn_i.upper_bound <= 0)
                            or (direction_i < 0 and rxn_i.lower_bound >= 0)
                            or (direction_j < 0 and rxn_j.upper_bound <= 0)
                            or (direction_j > 0 and rxn_j.lower_bound >= 0)
                        ):
                            continue

                        # Set the direction of the cycle for the first reaction.
                        if direction_i > 0:
                            rxn_i.lower_bound = max(rxn_i.lower_bound, min_flux)
                        else:
                            rxn_i.upper_bound = min(rxn_i.upper_bound, -min_flux)

                        # Set the opposite direction (with respect to the direction in
                        # the EFM) for the second reaction.
                        if direction_j < 0:
                            rxn_j.lower_bound = max(rxn_j.lower_bound, min_flux)
                        else:
                            rxn_j.upper_bound = min(rxn_j.upper_bound, -min_flux)

                        # Check whether the model is feasible. If yes, there are
                        # thermodynamically feasible solutions for the set of
                        # reactions involved in the EFM because the cycle is not
                        # mandatory.
                        if not np.isnan(model.slim_optimize()):
                            return False

        # If we could not find any thermodynamically feasible combination of directions
        # the cycle is forced.
        return True


[docs]class QuantitativeAssessment:
    """Quantitative thermodynamic assessment of a metabolic network. This process
    combines thermodynamic data and flux constraints to identify parts of the network
    where the mechanism in the model is possibly incorrect.

    Parameters
    ----------
    pmo_problem : PmoProblem
        A solved PMO problem.
    z_score_threshold : float, optional
        Threshold on the z-score to consider a predicted value an anomaly.
    shadow_price_threshold : float, optional
        Threshold on the shadow price to consider a reaction a strong thermodynamic
        constrain.
    max_extracellular_conc : Q, optional
        Threshold on the concentration to consider a predicted concentration an anomaly.
    """

    def __init__(
        self,
        pmo_problem: PmoProblem,
        z_score_threshold: float = default_theta_z,
        shadow_price_threshold: float = default_theta_s,
        max_extracellular_conc: Q = None,
    ):
        self._theta_z = z_score_threshold
        self._theta_s = shadow_price_threshold
        self._max_ex_conc = max_extracellular_conc or Q(
            default_max_non_intracellular_conc_mM, "mM"
        )
        self._is_intracellular = pd.DataFrame(
            [
                m.compartment not in non_intracellular_compartment_ids
                for m in pmo_problem.T.metabolites
            ]
        )

        # Restrict the PMO problem to the predicted directions. This way, the
        # problem is not an MILP anymore and solvers should be able to return
        # shadow prices.
        orthant_pmo = pmo_problem.rebuild_for_directions(pmo_problem.d.value)
        status = orthant_pmo.solve()

        assert status == "optimal" or status == "optimal_inaccurate"
        if status == "optimal_inaccurate":
            logger.warning("Inaccurate solution found when computing shadow prices.")

        # Assemble metabolite and reaction data in two data frames.
        self._metabolites_df = pd.DataFrame(
            {
                "id": pmo_problem.T.metabolite_ids,
                "conc": np.exp(pmo_problem.log_c).ravel() * 1000
                if pmo_problem.log_c is not None
                else np.nan,
                "z_log_c": self._make_z_scores(pmo_problem, "log_c"),
            }
        )

        # Verify whether the shadow prices were computed correctly.
        if (
            orthant_pmo.sign_constraints[0].dual_value is None
            or orthant_pmo.sign_constraints[1].dual_value is None
        ):
            logger.warning(
                "Unable to compute shadow prices. Solve again with verbose=True to see the solver output."
            )
            sp_drg = np.zeros(len(pmo_problem.T.reaction_ids)) * np.nan
        else:
            sp_drg = np.maximum(
                orthant_pmo.sign_constraints[0].dual_value,
                orthant_pmo.sign_constraints[1].dual_value,
            ).ravel()

        self._reactions_df = pd.DataFrame(
            {
                "id": pmo_problem.T.reaction_ids,
                "v": pmo_problem.v[pmo_problem._rxn_idxs_F_to_T].ravel(),
                "drg0": pmo_problem.drg0.ravel()
                if pmo_problem.drg0 is not None
                else np.nan,
                "drg": pmo_problem.drg.ravel()
                if pmo_problem.drg is not None
                else np.nan,
                "z_drg": self._make_z_scores(pmo_problem, "drg"),
                "z_drg0": self._make_z_scores(pmo_problem, "drg0"),
                "sp_drg": sp_drg,
            }
        )

    @property
[docs]    def metabolites_df(self) -> pd.DataFrame:
        """Gets a data frame with the metabolite-related quantities."""
        return self._metabolites_df

    @property
[docs]    def reactions_df(self) -> pd.DataFrame:
        """Gets a data frame with the reaction-related quantities."""
        return self._reactions_df

    @property
[docs]    def theta_z(self) -> float:
        """Gets the mimimum absolute z-score used to classify a metabolite
        concentration as anomaly."""
        return self._theta_z

    @property
[docs]    def theta_s(self) -> float:
        """Gets the mimimum absolute shadow price to flag a constraint."""
        return self._theta_s

    @property
[docs]    def max_ex_concentration(self) -> float:
        """Gets the minimum concentration used to classify a non-intracellular
        metabolite concentration as anomaly."""
        return self._max_ex_conc

[docs]    def summary(self):
        """Print a summary of the quantitative assessment."""

        print("Quantitative thermodynamic assessment summary:")
        print("------------------------------------------------")
        print("concentrations: mM, free energies: kJ/mol")
        print("")

        pd_context = pd.option_context(
            "display.max_rows",
            None,
            "display.max_columns",
            None,
            "display.precision",
            3,
        )

        # Search for anomalies based on concentration z-scores.
        anomalies_m_z = self.metabolites_df[
            self.metabolites_df["z_log_c"].abs() > self.theta_z
        ]
        anomalies_m_z = anomalies_m_z.reindex(
            anomalies_m_z["z_log_c"].abs().sort_values(ascending=False).index
        )
        if not anomalies_m_z.empty:
            print(
                f"> The following metabolites have been flagged as "
                f"anomalies because their predicted concentration has an "
                f"absolute z-score greater than {self.theta_z}:"
            )
            with pd_context:
                print(anomalies_m_z)
        else:
            print("> No anomaly found in concentration z-scores.")
        print("")

        # Search for anomalies based on maximum concentration.
        max_ex_concentration_mM = self.max_ex_concentration.m_as("mM")
        anomalies_m_m = self.metabolites_df[
            (self.metabolites_df["conc"] >= max_ex_concentration_mM)
            & ~pd.DataFrame(self._is_intracellular)[0]
            & (self.metabolites_df["z_log_c"].abs() > 0)
        ]
        anomalies_m_m = anomalies_m_m.reindex(
            anomalies_m_m["conc"].abs().sort_values(ascending=False).index
        )
        if not anomalies_m_m.empty:
            print(
                f"> The following non-intracellular metabolites have been "
                f"flagged as anomalies because they have concentation "
                f"greater than {max_ex_concentration_mM} mM:"
            )
            with pd_context:
                print(anomalies_m_m)
        else:
            print("> No anomaly found in non-intracellular concentrations.")
        print("")

        # Search for anomalies based on reaction z-scores.
        anomalies_r_z = self.reactions_df[
            (self.reactions_df["z_drg"].abs() > self.theta_z)
            | (self.reactions_df["z_drg0"].abs() > self.theta_z)
        ]
        anomalies_r_z = anomalies_r_z.reindex(
            anomalies_r_z[["z_drg", "z_drg0"]]
            .abs()
            .max(axis=1)
            .sort_values(ascending=False)
            .index
        )
        if not anomalies_r_z.empty:
            print(
                f"> The following reactions have been flagged as anomalies "
                f"because their predicted free energy or standard free "
                f"energy has an absolute z-score greater than {self.theta_z}:"
            )
            with pd_context:
                print(anomalies_r_z)
        else:
            print("> No anomaly found in free energy z-scores.")
        print("")

        # Search for anomalies based on reaction shadow prices.
        anomalies_r_s = self.reactions_df[(self.reactions_df["sp_drg"] > self.theta_s)]
        anomalies_r_s = anomalies_r_s.reindex(
            anomalies_r_s["sp_drg"].sort_values(ascending=False).index
        )
        if not anomalies_r_s.empty:
            print(
                f"> The following reactions are predicted to impose strong "
                f"thermodynamic constraints on the network because their "
                f"shadow price is greater than {self.theta_s}:"
            )
            with pd_context:
                print(anomalies_r_s)
        else:
            print("> No significant thermodynamic constraints found.")
        print("")

    def _make_z_scores(
        self, pmo_problem: PmoProblem, variable: str
    ) -> Union[np.array, float]:
        """Construct the z-scores for a specific type of variables."""
        transform = None
        if variable == "log_c":
            if pmo_problem.B.to_log_conc_transform is not None:
                transform = pmo_problem.B.to_log_conc_transform[0]
        elif variable == "drg":
            if pmo_problem.B.to_drg_transform is not None:
                transform = pmo_problem.B.to_drg_transform[0]
        elif variable == "drg0":
            if pmo_problem.B.to_drg0_transform is not None:
                transform = pmo_problem.B.to_drg0_transform[0]
        else:
            raise ValueError(f"Unsupported variable {variable}")

        if transform is None:
            return np.nan
        else:
            std_devs = np.sqrt(np.diag(transform @ transform.T))
            scaling = np.zeros(std_devs.shape)
            scaling[std_devs > 0] = np.reciprocal(std_devs[std_devs > 0])
            scaling = np.diag(scaling)
            return (scaling @ transform @ pmo_problem.m.value).ravel()


[docs]def prepare_for_pta(
    model: cobra.Model,
    biomass_id: Optional[str] = None,
    atpm_id: str = "ATPM",
    default_bound: float = default_flux_bound,
    autoresolve_inconsistencies: bool = True,
    remove_blocked_reactions: bool = True,
    tighten_bounds: bool = True,
    prevent_loops: bool = False,
):
    """Attempt to automatically prepare a model for use in PTA.

    This method performs three actions:

    * Runs structural assessment on the model and attempts to autoresolve possible
      inconsistencies.
    * Removes all blocked reactions from the model.
    * Runs FVA to tighten the flux bounds of each reaction.

    Parameters
    ----------
    model : cobra.Model
        The target model.
    biomass_id : Optional[str], optional
        ID of the biomass reactions. If None, the method will try to find it
        automatically.
    atpm_id : str, optional
        ID of the ATP maintenance reaction.
    default_bound : float, optional
        Default bound to use for unconstrained reactions.
    autoresolve_inconsistencies : bool, optional
        True if the method should attempt to automatically resolve inconsistencies,
        false otherwise. By default True.
    remove_blocked_reactions : bool, optional
        True if the method should remove blocked reactions, false otherwise. By default
        True.
    tighten_bounds : bool, optional
        True if the method should restrict the flux bounds with FVA, false otherwise. By
        default True.
    """

    logger.info("Preparing model for PTA ...")
    result = model.optimize()
    assert result.status != "infeasible", "Model is infeasible."

    # Run structural assessment and auto-resolve all inconsistencies.
    if autoresolve_inconsistencies:
        assessment = StructuralAssessment(model, biomass_id, atpm_id)
        assessment.autoresolve(model, default_bound)

    if remove_blocked_reactions or tighten_bounds:
        fva_result = flux_variability_analysis(
            model, loopless=prevent_loops, fraction_of_optimum=0.0
        )

    # Remove blocked reactions to make sure that all reactions can have a
    # well-defined direction.
    if remove_blocked_reactions:
        model.remove_reactions(
            find_blocked_reactions_from_fva_solution(model, fva_result=fva_result),
            remove_orphans=True,
        )

    # Restrict reaction bounds using FVA.
    if tighten_bounds:
        tighten_model_bounds(model, prevent_loops=prevent_loops, fva_result=fva_result)
    logger.info("Model preprocessing completed.")