nf-core · keiran-rowell-unsw · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026
diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml
diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf
diff --git a/multiqc_proteinfold/__init__.py b/multiqc_proteinfold/__init__.py
@@ -0,0 +1,6 @@
+"""MultiQC plugin for proteinfold pipeline outputs converting metrics to simple tsvs."""
+
+from .proteinfold import MultiqcModule
+
+__version__ = '0.1.0'
+__all__ = ["MultiqcModule"]
diff --git a/multiqc_proteinfold/multiqc_config.yaml b/multiqc_proteinfold/multiqc_config.yaml
@@ -0,0 +1,7 @@
+sp:
+  proteinfold:
+    fn: "*_{plddt,msa,ptm,iptm,pae}.tsv"
+
+custom_logo: "/srv/scratch/z3374843/MultiQC/multiqc/logo/SBF_logo.png"
+custom_logo_url: "https://doi.org/10.26190/4KQF-M552"
+custom_logo_title: "Structural Biology Facility - Mark Wainwright Analytical Centre - University of New South Wales"
diff --git a/multiqc_proteinfold/proteinfold.py b/multiqc_proteinfold/proteinfold.py
@@ -0,0 +1,271 @@
+from pathlib import Path
+import pandas as pd
+
+from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
+from multiqc.plots import bargraph, linegraph
+from multiqc import config  # I want the ranks to merge by default, not a user problem
+from multiqc.plots.table_object import ColumnDict
+
+from typing import Dict, Any, cast
+
+
+class MultiqcModule(BaseMultiqcModule):
+    """
+    The module parses results generated by a variety of protein structure prediction programs in the [ProteinFold](https://nf-co.re/proteinfold/) pipeline.
+    This includes (as of release v2.0):
+        - [AlphaFold2](https://github.com/google-deepmind/alphafold)
+        - [AlphaFold3](https://github.com/google-deepmind/alphafold3)
+        - [ColabFold](https://github.com/sokrypton/ColabFold)
+        - [ESMFold](https://github.com/facebookresearch/esm)
+        - [RoseTTaFold-All-Atom](https://github.com/baker-laboratory/RoseTTAFold-All-Atom)
+        - [RoseTTaFold2-Nucleic-Acids](https://github.com/uw-ipd/RoseTTAFold2NA) [Wait on merge]
+        - [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3)
+        - [Boltz](https://github.com/jwohlwend/boltz)
+
+    This is intended to provide a summary of useful metrics for mass 'folding' a large set of proteins, either in terms of fishing for mulitmer interactions or comparing methods across whole proteomes.
+    It provides a visual 'at-a-glance' report of relevant metrics (average pLDDT, ipTM, *etc*) and does not replace the per-protein interactive plot from GENEREATE_REPORT in  nfcore/proteinfold
+    """
+
+    def __init__(self):
+        super(MultiqcModule, self).__init__(
+            name="ProteinFold",
+            anchor="proteinfold",
+            href=[
+                "https://nf-co.re/proteinfold",
+                "https://github.com/google-deepmind/alphafold",
+                "https://github.com/google-deepmind/alphafold3",
+                "https://github.com/sokrypton/ColabFold",
+                "https://github.com/facebookresearch/esm",
+                "https://github.com/baker-laboratory/RoseTTAFold-All-Atom",
+                "https://github.com/uw-ipd/RoseTTAFold2NA",
+                "https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3",
+                "https://github.com/jwohlwend/boltz",
+            ],
+            info="ProteinFold - protein structure inference methods through a single nextflow pipeline interface",
+            doi=[
+                "10.1038/s41586-021-03819-2",
+                "10.1038/s41592-022-01488-1",
+                "10.1126/science.ade2574",
+                "10.1126/science.adl2528",
+                "10.1038/s41592-023-02086-5",
+                "10.48550/arXiv.2408.16975",
+                "10.1101/2024.11.19.624167",
+            ],
+        )
+
+        # Want to treat the ranked inference runs as 'sub-samples' for grouping logic, even if not separate files
+        if not hasattr(config, "table_sample_merge"):
+            config.table_sample_merge = {}
+
+        # Some codes generated 5 inferences for 5 models and all 25 are processed. If a user sets more they're an expert and can custom handle
+        config.table_sample_merge = {
+            "rank_0": ["_rank_0"],
+            "rank_1": ["_rank_1"],
+            "rank_2": ["_rank_2"],
+            "rank_3": ["_rank_3"],
+            "rank_4": ["_rank_4"],
+            "rank_5": ["_rank_5"],
+            "rank_6": ["_rank_6"],
+            "rank_7": ["_rank_7"],
+            "rank_8": ["_rank_8"],
+            "rank_9": ["_rank_9"],
+            "rank_10": ["_rank_10"],
+            "rank_11": ["_rank_11"],
+            "rank_12": ["_rank_12"],
+            "rank_13": ["_rank_13"],
+            "rank_14": ["_rank_14"],
+            "rank_15": ["_rank_15"],
+            "rank_16": ["_rank_16"],
+            "rank_17": ["_rank_17"],
+            "rank_18": ["_rank_18"],
+            "rank_19": ["_rank_19"],
+            "rank_20": ["_rank_20"],
+            "rank_21": ["_rank_21"],
+            "ranmodek_22": ["_rank_22"],
+            "rank_23": ["_rank_23"],
+            "rank_24": ["_rank_24"],
+        }
+
+        mode_dict = {
+            "alphafold2": "AlphaFold2",
+            "alphafold3": "AlphaFold3",
+            "colabfold": "ColabFold",
+            "esmfold": "ESMFold",
+            "rosettafold2na": "RoseTTAFold2-Nucleic-Acids",
+            "rosettafold_all_atom": "RoseTTAFold-All-Atom",
+            "helixfold3": "HelixFold3",
+            "boltz": "Boltz",
+        }
+
+        self.proteinfold_data: Dict[str, Any] = {}
+        # I want to enable sample grouping: https://docs.seqera.io/multiqc/reports/customisation#sample-grouping
+
+        for f in self.find_log_files("proteinfold"):
+            self.add_data_source(f)
+
+            raw_samplename = f["s_name"].split("_")[0]
+            filepath = Path(f["root"]) / f["fn"]
+            mode = "UNKNOWN"
+            for parent in filepath.parents:
+                if parent.name in mode_dict:  # traverse up the filepath until you hit a mode labelled dir
+                    mode = mode_dict[parent.name]
+                    break
+
+            mode_samplename = f"{raw_samplename}_{mode}"
+            samplename = self.clean_s_name(mode_samplename, f)
+            print(filepath)
+            print(samplename)
+
+            self.proteinfold_data.setdefault(samplename, {})  # Set default creates if doesn't already exist
+
+            if f["fn"].endswith("_msa.tsv"):
+                self.proteinfold_data[samplename]["msa_depth"] = f["f"].count("\n")
+
+            if f["fn"].endswith("_plddt.tsv"):
+                df = pd.read_csv(filepath, sep="\t")
+                rank_cols = [col for col in df.columns if col.startswith("rank_")]
+
+                # Full plddt data frame for plotting purposes
+                plddt_data = {col: df.set_index("Positions")[col].to_dict() for col in rank_cols}
+                self.proteinfold_data[samplename]["plddt"] = plddt_data
+
+                rank_means = df[rank_cols].mean().to_dict()
+                # Parent sample entry should still have the top ranked value
+                self.proteinfold_data[samplename]["mean_plddt"] = rank_means.get("rank_0")
+
+                for rank in rank_cols:
+                    rank_num = rank.split("rank_")[1]
+                    subsample = f"{samplename}_rank_{rank_num}"
+
+                    self.proteinfold_data.setdefault(subsample, {})
+                    self.proteinfold_data[subsample]["mean_plddt"] = rank_means[rank]
+
+            if f["fn"].endswith("_iptm.tsv") and not f["fn"].endswith("_chainwise_iptm.tsv"):
+                iptm_series = cast(
+                    pd.Series, pd.read_csv(filepath, sep="\t", header=None, index_col=0).squeeze(axis=1)
+                )  # Squeeze makes a series accessible on index label
+                if (
+                    0 in iptm_series.index
+                ):  # Since pandas infers the index as an int this is an exact int match not a greedy string match
+                    self.proteinfold_data[samplename]["iptm"] = iptm_series.loc[
+                        0
+                    ]  # Remember loc is an *int* match on rank 0 index
+
+                for rank_num in iptm_series.index:
+                    subsample = f"{samplename}_rank_{rank_num}"
+                    self.proteinfold_data.setdefault(subsample, {})["iptm"] = iptm_series.loc[rank_num]
+
+            if f["fn"].endswith("_ptm.tsv") and not f["fn"].endswith("_chainwise_ptm.tsv"):
+                ptm_series = cast(
+                    pd.Series, pd.read_csv(filepath, sep="\t", header=None, index_col=0).squeeze(axis=1)
+                )  # Squeeze on axis avoids int conversion for single entries
+                if 0 in ptm_series.index:
+                    self.proteinfold_data[samplename]["ptm"] = ptm_series.loc[0]
+
+                for rank_num in ptm_series.index:
+                    subsample = f"{samplename}_rank_{rank_num}"
+                    self.proteinfold_data.setdefault(subsample, {})["ptm"] = ptm_series.loc[rank_num]
+
+        self.write_data_file(
+            self.proteinfold_data, "proteinfold_data"
+        )  # I want to structure and rename from avg_plDDT to summary_stats
+        self.general_stats_table()
+        # Togglable plDDT by residue plots of all ranks
+        self.plddt_line_plot()
+
+    def general_stats_table(self):
+        """
+        Put protein structure prediction metrics into a general table for all different Deep Learning methods
+        """
+        # Check for empy metrics to drop those columns where not appropriate
+
+        has_iptm = any(
+            sample_data.get("iptm") and sample_data.get("iptm") != 0.0 for sample_data in self.proteinfold_data.values()
+        )
+        has_ptm = any(
+            sample_data.get("ptm") and sample_data.get("ptm") != 0.0 for sample_data in self.proteinfold_data.values()
+        )
+
+        headers: Dict[str, ColumnDict] = {
+            "msa_depth": {
+                "title": "Related sequence depth (MSA)",
+                "description": "The number of related sequences (across the whole protein) that could be retrieved from the MSA (Multiple Sequence Alignment) stage",
+            },
+            "mean_plddt": {
+                "title": "Structure confidence (average pLDDT)",
+                "description": "Structure prediction confidence score across all residues in the top ranked protein structure - from the mean pLDDT (predicted Local Distance Difference Test) value",
+                "max": 100,
+                "min": 0,
+                "cond_formatting_rules": {
+                    "very-low": [{"lt": 50}],
+                    "low": [{"gt": 50}, {"lt": 70}],
+                    "high": [{"gt": 70}, {"lt": 90}],
+                    "very-high": [{"gt": 90}],
+                },
+                "cond_formatting_colours": [
+                    {"very-low": "#f0743e"},
+                    {"low": "#f9d613"},
+                    {"high": "#60c2e8"},
+                    {"very-high": "#014ecc"},
+                ],
+            },
+        }
+
+        if has_iptm:
+            headers["iptm"] = {
+                "title": "Interface accuracy (ipTM)",
+                "description": "Accuracy of the relative positions of two protein subunits from a mulitmer calcuation - from the ipTM (interface predicted Template Modelling) score",
+                "max": 1,
+                "min": 0,
+                "format": "{:,.2f}",
+                "scale": "Purples",
+            }
+
+        if has_ptm:
+            headers["ptm"] = {
+                "title": "Global accuracy (TM)",
+                "description": "Global accuracy of the protein folded, less sensitive to localised inaccuracies than raw 3D atomic deviations (RMSD) - from the pTM (predicted Template Modelling) score",
+                "max": 1,
+                "min": 0,
+                "format": "{:,.2f}",
+                "scale": "Blues",
+            }
+
+        self.general_stats_addcols(self.proteinfold_data, headers)
+
+    def plddt_line_plot(self):
+        """Line plot showing pLDDT confidence across residue position of selected sample, for all ranks"""
+
+        parent_samples = {}
+
+        for sample, metrics in self.proteinfold_data.items():
+            if "plddt" in metrics:
+                if "_rank_" not in sample:  # The parent sample already has the plddt data for all ranks
+                    parent_samples[sample] = metrics["plddt"]
+
+        data_labels = []  # Need a data_labels list for the sample switcher
+        plot_data_list = []
+
+        # The populated data_labels plot config section is what the switcher uses
+        for parent_sample, rank_data in parent_samples.items():
+            data_labels.append({"name": parent_sample, "ylab": "pLDDT score"})
+            plot_data_list.append(rank_data)
+
+        pconfig = {
+            "id": "proteinfold_plddt_lineplot",
+            "title": "ProteinFold: pLDDT by Position",
+            "xlab": "Residue Position",
+            "ylab": "pLDDT Score",
+            "ymin": 0,
+            "ymax": 100,
+            "data_labels": data_labels,
+        }
+
+        plot_html = linegraph.plot(plot_data_list, pconfig)
+
+        self.add_section(
+            name="pLDDT Confidence by residue",
+            anchor="proteinfold-plddt-per-res",
+            description="Per-residue confidence scores across all predicted ranks",
+            plot=plot_html,
+        )
diff --git a/setup.py b/setup.py
@@ -0,0 +1,28 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='multiqc-proteinfold',
+    version='0.1.0',
+    author='Keiran Rowell',
+    author_email='k.rowell@unsw.edu.au',
+    description='MultiQC plugin for proteinfold pipeline metric tsv outputs',
+    url='https://github.com/nf-core/proteinfold',
+    packages=find_packages(),
+    include_package_data=True,
+    entry_points={
+        'multiqc.modules.v1': [
+            'proteinfold = multiqc_proteinfold.proteinfold:MultiqcModule',
+        ],
+    },
+    install_requires=[
+        'multiqc>=1.15',
+        'pandas',
+    ],
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python :: 3',
+    ],
+    python_requires='>=3.8',
+)