Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions modules/nf-core/multiqc/environment.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions modules/nf-core/multiqc/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions multiqc_proteinfold/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""MultiQC plugin for proteinfold pipeline outputs converting metrics to simple tsvs."""

from .proteinfold import MultiqcModule

__version__ = '0.1.0'
__all__ = ["MultiqcModule"]
7 changes: 7 additions & 0 deletions multiqc_proteinfold/multiqc_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
sp:
proteinfold:
fn: "*_{plddt,msa,ptm,iptm,pae}.tsv"

custom_logo: "/srv/scratch/z3374843/MultiQC/multiqc/logo/SBF_logo.png"
custom_logo_url: "https://doi.org/10.26190/4KQF-M552"
custom_logo_title: "Structural Biology Facility - Mark Wainwright Analytical Centre - University of New South Wales"
271 changes: 271 additions & 0 deletions multiqc_proteinfold/proteinfold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
from pathlib import Path
import pandas as pd

from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
from multiqc.plots import bargraph, linegraph
from multiqc import config # I want the ranks to merge by default, not a user problem
from multiqc.plots.table_object import ColumnDict

from typing import Dict, Any, cast


class MultiqcModule(BaseMultiqcModule):
"""
The module parses results generated by a variety of protein structure prediction programs in the [ProteinFold](https://nf-co.re/proteinfold/) pipeline.
This includes (as of release v2.0):
- [AlphaFold2](https://github.com/google-deepmind/alphafold)
- [AlphaFold3](https://github.com/google-deepmind/alphafold3)
- [ColabFold](https://github.com/sokrypton/ColabFold)
- [ESMFold](https://github.com/facebookresearch/esm)
- [RoseTTaFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom)
- [RoseTTaFold2-Nucleic-Acids](https://github.com/uw-ipd/RoseTTAFold2NA) [Wait on merge]
- [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3)
- [Boltz](https://github.com/jwohlwend/boltz)

This is intended to provide a summary of useful metrics for mass 'folding' a large set of proteins, either in terms of fishing for mulitmer interactions or comparing methods across whole proteomes.
It provides a visual 'at-a-glance' report of relevant metrics (average pLDDT, ipTM, *etc*) and does not replace the per-protein interactive plot from GENEREATE_REPORT in nfcore/proteinfold
"""

def __init__(self):
super(MultiqcModule, self).__init__(
name="ProteinFold",
anchor="proteinfold",
href=[
"https://nf-co.re/proteinfold",
"https://github.com/google-deepmind/alphafold",
"https://github.com/google-deepmind/alphafold3",
"https://github.com/sokrypton/ColabFold",
"https://github.com/facebookresearch/esm",
"https://github.com/baker-laboratory/RoseTTAFold-All-Atom",
"https://github.com/uw-ipd/RoseTTAFold2NA",
"https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3",
"https://github.com/jwohlwend/boltz",
],
info="ProteinFold - protein structure inference methods through a single nextflow pipeline interface",
doi=[
"10.1038/s41586-021-03819-2",
"10.1038/s41592-022-01488-1",
"10.1126/science.ade2574",
"10.1126/science.adl2528",
"10.1038/s41592-023-02086-5",
"10.48550/arXiv.2408.16975",
"10.1101/2024.11.19.624167",
],
)

# Want to treat the ranked inference runs as 'sub-samples' for grouping logic, even if not separate files
if not hasattr(config, "table_sample_merge"):
config.table_sample_merge = {}

# Some codes generated 5 inferences for 5 models and all 25 are processed. If a user sets more they're an expert and can custom handle
config.table_sample_merge = {
"rank_0": ["_rank_0"],
"rank_1": ["_rank_1"],
"rank_2": ["_rank_2"],
"rank_3": ["_rank_3"],
"rank_4": ["_rank_4"],
"rank_5": ["_rank_5"],
"rank_6": ["_rank_6"],
"rank_7": ["_rank_7"],
"rank_8": ["_rank_8"],
"rank_9": ["_rank_9"],
"rank_10": ["_rank_10"],
"rank_11": ["_rank_11"],
"rank_12": ["_rank_12"],
"rank_13": ["_rank_13"],
"rank_14": ["_rank_14"],
"rank_15": ["_rank_15"],
"rank_16": ["_rank_16"],
"rank_17": ["_rank_17"],
"rank_18": ["_rank_18"],
"rank_19": ["_rank_19"],
"rank_20": ["_rank_20"],
"rank_21": ["_rank_21"],
"ranmodek_22": ["_rank_22"],
"rank_23": ["_rank_23"],
"rank_24": ["_rank_24"],
}

mode_dict = {
"alphafold2": "AlphaFold2",
"alphafold3": "AlphaFold3",
"colabfold": "ColabFold",
"esmfold": "ESMFold",
"rosettafold2na": "RoseTTAFold2-Nucleic-Acids",
"rosettafold_all_atom": "RoseTTAFold-All-Atom",
"helixfold3": "HelixFold3",
"boltz": "Boltz",
}

self.proteinfold_data: Dict[str, Any] = {}
# I want to enable sample grouping: https://docs.seqera.io/multiqc/reports/customisation#sample-grouping

for f in self.find_log_files("proteinfold"):
self.add_data_source(f)

raw_samplename = f["s_name"].split("_")[0]
filepath = Path(f["root"]) / f["fn"]
mode = "UNKNOWN"
for parent in filepath.parents:
if parent.name in mode_dict: # traverse up the filepath until you hit a mode labelled dir
mode = mode_dict[parent.name]
break

mode_samplename = f"{raw_samplename}_{mode}"
samplename = self.clean_s_name(mode_samplename, f)
print(filepath)
print(samplename)

self.proteinfold_data.setdefault(samplename, {}) # Set default creates if doesn't already exist

if f["fn"].endswith("_msa.tsv"):
self.proteinfold_data[samplename]["msa_depth"] = f["f"].count("\n")

if f["fn"].endswith("_plddt.tsv"):
df = pd.read_csv(filepath, sep="\t")
rank_cols = [col for col in df.columns if col.startswith("rank_")]

# Full plddt data frame for plotting purposes
plddt_data = {col: df.set_index("Positions")[col].to_dict() for col in rank_cols}
self.proteinfold_data[samplename]["plddt"] = plddt_data

rank_means = df[rank_cols].mean().to_dict()
# Parent sample entry should still have the top ranked value
self.proteinfold_data[samplename]["mean_plddt"] = rank_means.get("rank_0")

for rank in rank_cols:
rank_num = rank.split("rank_")[1]
subsample = f"{samplename}_rank_{rank_num}"

self.proteinfold_data.setdefault(subsample, {})
self.proteinfold_data[subsample]["mean_plddt"] = rank_means[rank]

if f["fn"].endswith("_iptm.tsv") and not f["fn"].endswith("_chainwise_iptm.tsv"):
iptm_series = cast(
pd.Series, pd.read_csv(filepath, sep="\t", header=None, index_col=0).squeeze(axis=1)
) # Squeeze makes a series accessible on index label
if (
0 in iptm_series.index
): # Since pandas infers the index as an int this is an exact int match not a greedy string match
self.proteinfold_data[samplename]["iptm"] = iptm_series.loc[
0
] # Remember loc is an *int* match on rank 0 index

for rank_num in iptm_series.index:
subsample = f"{samplename}_rank_{rank_num}"
self.proteinfold_data.setdefault(subsample, {})["iptm"] = iptm_series.loc[rank_num]

if f["fn"].endswith("_ptm.tsv") and not f["fn"].endswith("_chainwise_ptm.tsv"):
ptm_series = cast(
pd.Series, pd.read_csv(filepath, sep="\t", header=None, index_col=0).squeeze(axis=1)
) # Squeeze on axis avoids int conversion for single entries
if 0 in ptm_series.index:
self.proteinfold_data[samplename]["ptm"] = ptm_series.loc[0]

for rank_num in ptm_series.index:
subsample = f"{samplename}_rank_{rank_num}"
self.proteinfold_data.setdefault(subsample, {})["ptm"] = ptm_series.loc[rank_num]

self.write_data_file(
self.proteinfold_data, "proteinfold_data"
) # I want to structure and rename from avg_plDDT to summary_stats
self.general_stats_table()
# Togglable plDDT by residue plots of all ranks
self.plddt_line_plot()

def general_stats_table(self):
"""
Put protein structure prediction metrics into a general table for all different Deep Learning methods
"""
# Check for empy metrics to drop those columns where not appropriate

has_iptm = any(
sample_data.get("iptm") and sample_data.get("iptm") != 0.0 for sample_data in self.proteinfold_data.values()
)
has_ptm = any(
sample_data.get("ptm") and sample_data.get("ptm") != 0.0 for sample_data in self.proteinfold_data.values()
)

headers: Dict[str, ColumnDict] = {
"msa_depth": {
"title": "Related sequence depth (MSA)",
"description": "The number of related sequences (across the whole protein) that could be retrieved from the MSA (Multiple Sequence Alignment) stage",
},
"mean_plddt": {
"title": "Structure confidence (average pLDDT)",
"description": "Structure prediction confidence score across all residues in the top ranked protein structure - from the mean pLDDT (predicted Local Distance Difference Test) value",
"max": 100,
"min": 0,
"cond_formatting_rules": {
"very-low": [{"lt": 50}],
"low": [{"gt": 50}, {"lt": 70}],
"high": [{"gt": 70}, {"lt": 90}],
"very-high": [{"gt": 90}],
},
"cond_formatting_colours": [
{"very-low": "#f0743e"},
{"low": "#f9d613"},
{"high": "#60c2e8"},
{"very-high": "#014ecc"},
],
},
}

if has_iptm:
headers["iptm"] = {
"title": "Interface accuracy (ipTM)",
"description": "Accuracy of the relative positions of two protein subunits from a mulitmer calcuation - from the ipTM (interface predicted Template Modelling) score",
"max": 1,
"min": 0,
"format": "{:,.2f}",
"scale": "Purples",
}

if has_ptm:
headers["ptm"] = {
"title": "Global accuracy (TM)",
"description": "Global accuracy of the protein folded, less sensitive to localised inaccuracies than raw 3D atomic deviations (RMSD) - from the pTM (predicted Template Modelling) score",
"max": 1,
"min": 0,
"format": "{:,.2f}",
"scale": "Blues",
}

self.general_stats_addcols(self.proteinfold_data, headers)

def plddt_line_plot(self):
"""Line plot showing pLDDT confidence across residue position of selected sample, for all ranks"""

parent_samples = {}

for sample, metrics in self.proteinfold_data.items():
if "plddt" in metrics:
if "_rank_" not in sample: # The parent sample already has the plddt data for all ranks
parent_samples[sample] = metrics["plddt"]

data_labels = [] # Need a data_labels list for the sample switcher
plot_data_list = []

# The populated data_labels plot config section is what the switcher uses
for parent_sample, rank_data in parent_samples.items():
data_labels.append({"name": parent_sample, "ylab": "pLDDT score"})
plot_data_list.append(rank_data)

pconfig = {
"id": "proteinfold_plddt_lineplot",
"title": "ProteinFold: pLDDT by Position",
"xlab": "Residue Position",
"ylab": "pLDDT Score",
"ymin": 0,
"ymax": 100,
"data_labels": data_labels,
}

plot_html = linegraph.plot(plot_data_list, pconfig)

self.add_section(
name="pLDDT Confidence by residue",
anchor="proteinfold-plddt-per-res",
description="Per-residue confidence scores across all predicted ranks",
plot=plot_html,
)
28 changes: 28 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from setuptools import setup, find_packages

setup(
name='multiqc-proteinfold',
version='0.1.0',
author='Keiran Rowell',
author_email='k.rowell@unsw.edu.au',
description='MultiQC plugin for proteinfold pipeline metric tsv outputs',
url='https://github.com/nf-core/proteinfold',
packages=find_packages(),
include_package_data=True,
entry_points={
'multiqc.modules.v1': [
'proteinfold = multiqc_proteinfold.proteinfold:MultiqcModule',
],
},
install_requires=[
'multiqc>=1.15',
'pandas',
],
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
],
python_requires='>=3.8',
)