Export

Exporting of proteomics data from Qtable into external formats.

This module offers functionality to convert and save Qtable data into files compatible with external tools (Amica and Perseus), and to create sequence coverage maps in HTML format. While most functions operate on Qtable instances, some may accept other data structures.

Classes:

Name	Description
`Protein`	Abstract protein entry
`ProteinDatabase`	Abstract protein database

Functions:

Name	Description
`contaminants_to_clipboard`	Creates a contaminant table and writes it to the system clipboard.
`to_perseus_matrix`	Exports a qtable to a perseus matrix file in tsv format.
`to_amica`	Exports a qtable to an amica protein table and design files.
`write_html_coverage_map`	Generates an html file containing a protein coverage map.

Protein

Bases: Protocol

Abstract protein entry

ProteinDatabase

Bases: Protocol

Abstract protein database

contaminants_to_clipboard

contaminants_to_clipboard(qtable: Qtable) -> None

Creates a contaminant table and writes it to the system clipboard.

The contaminant table contains "iBAQ rank", "riBAQ", "iBAQ intensity", "Intensity", and "Expression" columns for each sample. Imputed values in the "Expression" columns are set to NaN.

The qtable must at least contain "iBAQ intensity" and "Missing" sample columns and a "Potential contaminant" column; expression columns must also be set. For the calculation of iBAQ intensities, refer to msreport.reader.add_ibaq_intensities(). "Missing" sample columns can be added with msreport.analyze.analyze_missingness().

Parameters:

Name	Type	Description	Default
`qtable`	`Qtable`	A Qtable instance. Requires that column names follow the MsReport conventions.	required

Source code in msreport\export.py

def contaminants_to_clipboard(qtable: Qtable) -> None:
    """Creates a contaminant table and writes it to the system clipboard.

    The contaminant table contains "iBAQ rank", "riBAQ", "iBAQ intensity", "Intensity",
    and "Expression" columns for each sample. Imputed values in the "Expression" columns
    are set to NaN.

    The "iBAQ rank" of a sample is 1 for the highest iBAQ intensity. Entries with a
    missing (NaN) iBAQ intensity receive the lowest ranks, and tied intensities are
    ranked in order of appearance.

    The qtable must at least contain "iBAQ intensity" and "Missing" sample columns and
    a "Potential contaminant" column; expression columns must also be set. For the
    calculation of iBAQ intensities refer to msreport.reader.add_ibaq_intensities().
    "Missing" sample columns can be added with msreport.analyze.analyze_missingness().

    Args:
        qtable: A Qtable instance. Requires that column names follow the MsReport
            conventions.
    """
    columns = [
        "Representative protein",
        "Protein entry name",
        "Gene name",
        "Fasta header",
        "Protein length",
        "Total peptides",
        "iBAQ peptides",
        "iBAQ intensity total",
    ]
    column_tags = ["iBAQ rank", "riBAQ", "iBAQ intensity", "Intensity", "Expression"]

    samples = qtable.get_samples()
    data = qtable.get_data()

    # NaN-aware sum of the iBAQ intensities across samples, divided by the number
    # of samples; this column is used below to order the exported rows.
    data["iBAQ intensity total"] = np.nansum(
        data[[f"iBAQ intensity {s}" for s in samples]], axis=1
    ) / len(samples)
    for sample in samples:
        # Blank out expression values of entries flagged as missing in this sample.
        data.loc[data[f"Missing {sample}"], f"Expression {sample}"] = np.nan

        ibaq_values = data[f"iBAQ intensity {sample}"]
        # Rank on a plain NumPy array: calling np.argsort on a pandas Series
        # dispatches to Series.argsort, which may mark NaN entries with -1 and
        # would corrupt the rank assignment. Sorting the negated values with a
        # stable sort yields a descending order in which NaN entries come last.
        ibaq_array = ibaq_values.to_numpy(dtype=float)
        order = np.argsort(-ibaq_array, kind="stable")
        rank = np.empty(len(ibaq_array), dtype=int)
        rank[order] = np.arange(1, len(ibaq_array) + 1)
        data[f"iBAQ rank {sample}"] = rank
        # Relative iBAQ in percent of the summed iBAQ intensity of the sample.
        data[f"riBAQ {sample}"] = ibaq_values / ibaq_values.sum() * 100

    for column_tag in column_tags:
        columns.extend(helper.find_sample_columns(data, column_tag, samples))
    # Only export columns that are actually present in the table.
    columns = [c for c in columns if c in data.columns]

    contaminants = qtable["Potential contaminant"]
    data = data.loc[contaminants, columns]

    # Reassign instead of sorting in place to avoid mutating a selection result.
    data = data.sort_values("iBAQ intensity total", ascending=False)
    data.to_clipboard(index=False)

to_perseus_matrix

to_perseus_matrix(
    qtable: Qtable,
    directory: str | Path,
    table_name: str = "perseus_matrix.tsv",
) -> None

Exports a qtable to a perseus matrix file in tsv format.

The Perseus matrix file has a second header row that contains single-letter entries for column annotations. The first entry starts with the string "#!{Type}" followed by an annotation letter, such as "#!{Type}E".

The single-letter annotation codes are:

E = Expression, N = Numerical, C = Categorical, T = Text

Parameters:

Name	Type	Description	Default
`qtable`	`Qtable`	A Qtable instance.	required
`directory`	`str \| Path`	Output path of the generated files.	required
`table_name`	`str`	Optional, filename of the perseus matrix file. Default is "perseus_matrix.tsv".	`'perseus_matrix.tsv'`

Source code in msreport\export.py

def to_perseus_matrix(
    qtable: Qtable,
    directory: str | pathlib.Path,
    table_name: str = "perseus_matrix.tsv",
) -> None:
    """Writes the qtable as a tab-separated Perseus matrix file.

    Below the regular header, a Perseus matrix carries an annotation row with one
    single-letter type code per column. The first entry of that row is prefixed with
    "#!{Type}", for example "#!{Type}E".

    Type codes, in order of precedence when a column matches several rules:
        E = Expression (expression column of a sample)
        C = Categorical ("Potential contaminant", "Valid", and any column whose name
            contains "Events" or "Missing")
        N = Numerical (remaining columns with a numeric dtype)
        T = Text (everything else)

    Args:
        qtable: A Qtable instance.
        directory: Output path of the generated files.
        table_name: Optional, filename of the perseus matrix file. Default is
            "perseus_matrix.tsv".
    """
    table = qtable.data

    categorical = {"Potential contaminant", "Valid"}
    categorical.update(
        name for name in table.columns if "Events" in name or "Missing" in name
    )
    expression = {qtable.get_expression_column(s) for s in qtable.get_samples()}
    numeric = set(table.select_dtypes(include="number").columns)

    def annotation_for(column) -> str:
        # The checks are ordered from highest to lowest precedence.
        if column in expression:
            return "E"
        if column in categorical:
            return "C"
        if column in numeric:
            return "N"
        return "T"

    annotations = [annotation_for(column) for column in table.columns]
    # Perseus recognizes the annotation row by the prefix on its first entry.
    annotations[0] = "#!{Type}" + annotations[0]
    annotation_row = pd.DataFrame(columns=table.columns, data=[annotations])

    # The annotation row must directly follow the header, so it is stacked on top.
    output_path = os.path.join(directory, table_name)
    pd.concat([annotation_row, table]).to_csv(output_path, sep="\t", index=False)

to_amica

to_amica(
    qtable: Qtable,
    directory: str | Path,
    table_name: str = "amica_table.tsv",
    design_name: str = "amica_design.tsv",
) -> None

Exports a qtable to an amica protein table and design files.

Note that amica expects the same number of columns for each group of intensity columns (Intensity, LFQIntensity, ImputedIntensity, iBAQ), therefore only sample columns are included from samples that are present in the qtable design.

Parameters:

Name	Type	Description	Default
`qtable`	`Qtable`	A Qtable instance.	required
`directory`	`str \| Path`	Output path of the generated files.	required
`table_name`	`str`	Optional, filename of the amica table file. Default is "amica_table.tsv".	`'amica_table.tsv'`
`design_name`	`str`	Optional, filename of the amica design file. Default is "amica_design.tsv".	`'amica_design.tsv'`

Source code in msreport\export.py

def to_amica(
    qtable: Qtable,
    directory: str | pathlib.Path,
    table_name: str = "amica_table.tsv",
    design_name: str = "amica_design.tsv",
) -> None:
    """Writes an amica protein table and an amica design file for a qtable.

    Amica expects every group of intensity columns (Intensity, LFQIntensity,
    ImputedIntensity, iBAQ) to have the same number of columns. For this reason, sample
    columns are only included for samples that are present in the qtable design.

    Both files are written as tab-separated text without an index column.

    Args:
        qtable: A Qtable instance.
        directory: Output path of the generated files.
        table_name: Optional, filename of the amica table file. Default is
            "amica_table.tsv".
        design_name: Optional, filename of the amica design file. Default is
            "amica_design.tsv".
    """
    # Each output is built and written in turn: first the protein table, then the
    # design.
    outputs = (
        (_amica_table_from, table_name),
        (_amica_design_from, design_name),
    )
    for build_frame, filename in outputs:
        frame = build_frame(qtable)
        frame.to_csv(os.path.join(directory, filename), sep="\t", index=False)

write_html_coverage_map

write_html_coverage_map(
    filepath: str,
    protein_id: str,
    peptide_table: DataFrame,
    protein_db: ProteinDatabase,
    displayed_name: Optional[str] = None,
    coverage_color: str = "#E73C40",
    highlight_positions: Optional[Iterable[int]] = None,
    highlight_color: str = "#1E90FF",
    column_length: int = 10,
    row_length: int = 50,
)

Generates an html file containing a protein coverage map.

Parameters:

Name	Type	Description	Default
`filepath`	`str`	The filepath where the generated html file will be saved.	required
`protein_id`	`str`	ID of the protein that will be displayed on the html page. Must correspond to an entry in the specified `protein_db`.	required
`peptide_table`	`DataFrame`	Dataframe which contains peptide information required for calculation of the protein sequence coverage.	required
`protein_db`	`ProteinDatabase`	A protein database containing entries from one or multiple FASTA files.	required
`displayed_name`	`Optional[str]`	Allows specifying a custom displayed name. By default, the protein name and protein id are shown.	`None`
`coverage_color`	`str`	Hex color code for highlighting amino acids that correspond to covered regions from the coverage mask, for example "#FF0000" for red.	`'#E73C40'`
`highlight_positions`	`Optional[Iterable[int]]`	Optional, allows specifying a list of amino acid positions that are highlighted in a different color. Note that positions specified here will overwrite the coloring from the coverage mask. Positions are one-indexed, which means that the first amino acid position is 1.	`None`
`highlight_color`	`str`	Hex color code for highlighting amino acids specified with the 'highlight_positions' variable.	`'#1E90FF'`
`column_length`	`int`	Number of amino acids after which a space is inserted.	`10`
`row_length`	`int`	Number of amino acids after which a new line is inserted.	`50`

Source code in msreport\export.py

def write_html_coverage_map(
    filepath: str,
    protein_id: str,
    peptide_table: pd.DataFrame,
    protein_db: ProteinDatabase,
    displayed_name: Optional[str] = None,
    coverage_color: str = "#E73C40",
    highlight_positions: Optional[Iterable[int]] = None,
    highlight_color: str = "#1E90FF",
    column_length: int = 10,
    row_length: int = 50,
) -> None:
    """Generates an html file containing a protein coverage map.

    The html file is written with UTF-8 encoding. Calling this function emits a
    FutureWarning because the interface is still experimental.

    Args:
        filepath: The filepath where the generated html file will be saved.
        protein_id: ID of the protein that will be displayed on the html page. Must
            correspond to an entry in the specified `protein_db`.
        peptide_table: Dataframe which contains peptide information required for
            calculation of the protein sequence coverage. The columns
            "Representative protein", "Start position", and "End position" are read.
        protein_db: A protein database containing entries from one or multiple FASTA
            files.
        displayed_name: Allows specifying a custom displayed name. By default, the
            protein name and protein id are shown.
        coverage_color: Hex color code for highlighting amino acids that correspond to
            covered regions from the coverage mask, for example "#FF0000" for red.
        highlight_positions: Optional, allows specifying a list of amino acid positions
            that are highlighted in a different color. Note that positions specified
            here will overwrite the coloring from the coverage mask. Positions are
            one-indexed, which means that the first amino acid position is 1.
        highlight_color: Hex color code for highlighting amino acids specified with the
            'highlight_positions' variable.
        column_length: Number of amino acids after which a space is inserted.
        row_length: Number of amino acids after which a new line is inserted.
    """
    warnings.warn(
        (
            "`write_html_coverage_map` is still experimental, and the interface might "
            "change in a future release."
        ),
        FutureWarning,
        stacklevel=2,
    )
    # Get protein information from the protein database
    protein_entry = protein_db[protein_id]
    sequence = protein_entry.sequence
    protein_length = len(sequence)

    if displayed_name is None:
        # Fall back to the protein id alone when no separate protein name is found.
        protein_name = msreport.reader._get_annotation_protein_name(
            protein_entry, default_value=protein_id
        )
        if protein_name == protein_id:
            displayed_name = protein_id
        else:
            displayed_name = f"{protein_name} ({protein_id})"

    # Generate coverage boundaries from a peptide table
    id_column = "Representative protein"
    peptide_group = peptide_table[peptide_table[id_column] == protein_id]
    peptide_positions = list(
        zip(peptide_group["Start position"], peptide_group["End position"])
    )
    coverage_mask = helper.make_coverage_mask(protein_length, peptide_positions)
    boundaries = _find_covered_region_boundaries(coverage_mask)

    # Define highlight positions; the one-indexed input is shifted by one to
    # obtain zero-based keys.
    highlight_positions = highlight_positions if highlight_positions is not None else ()
    highlights = {pos - 1: highlight_color for pos in highlight_positions}
    html_title = f"Coverage map: {displayed_name}"

    # Generate and save the html page
    sequence_coverage = helper.calculate_sequence_coverage(
        protein_length, peptide_positions, ndigits=1
    )
    html_sequence_map = _generate_html_sequence_map(
        sequence,
        boundaries,
        coverage_color,
        highlights=highlights,
        column_length=column_length,
        row_length=row_length,
    )
    html_text = _generate_html_coverage_map_page(
        html_sequence_map, sequence_coverage, title=html_title
    )
    # An explicit encoding keeps the output independent of the platform's locale
    # and avoids encoding errors for non-ASCII protein names.
    with open(filepath, "w", encoding="utf-8") as openfile:
        openfile.write(html_text)