
Commit ae695d7

feat: fetch detailed interpro result
1 parent 36e504f · commit ae695d7

File tree: 3 files changed, +41 -226 lines changed
Lines changed: 0 additions & 1 deletion

@@ -1,3 +1,2 @@
 {"type": "protein", "content": "P01308"}
 {"type": "protein", "content": "Q96KN2"}
-{"type": "protein", "content": "MGHHHHHHHGSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"}

examples/search/search_protein/search_interpro/search_interpro_config.yaml

Lines changed: 0 additions & 1 deletion

@@ -24,5 +24,4 @@ nodes:
     params:
       data_source: interpro # data source for searcher, support: wikipedia, google, uniprot, ncbi, interpro
       interpro_params:
-        email: test@example.com # Email address for EBI API requests
         api_timeout: 30 # Request timeout in seconds
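For reference, a minimal sketch of how the trimmed interpro_params block might be wired up. Only data_source, interpro_params, and api_timeout come from the config above, and InterProSearch(api_timeout=...) matches the constructor in the diff below; the YAML loading code and the config traversal are illustrative assumptions, not part of this commit.

# Illustrative only: build the searcher from the example config.
# The nodes/params traversal is an assumption about the config layout;
# InterProSearch and api_timeout come from the diff below.
import yaml

from graphgen.models.searcher.db.interpro_searcher import InterProSearch

with open(
    "examples/search/search_protein/search_interpro/search_interpro_config.yaml",
    encoding="utf-8",
) as f:
    config = yaml.safe_load(f)

searcher_params = config["nodes"][0]["params"]  # assumed node layout
searcher = InterProSearch(**searcher_params.get("interpro_params", {}))  # api_timeout=30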

graphgen/models/searcher/db/interpro_searcher.py

Lines changed: 41 additions & 224 deletions
@@ -1,5 +1,4 @@
 import re
-import time
 from typing import Dict, Optional

 import requests
@@ -20,227 +19,29 @@ class InterProSearch(BaseSearcher):
     InterPro Search client to search protein domains and functional annotations.
     Supports:
     1) Get protein domain information by UniProt accession number.
-    2) Search with protein sequence using EBI InterProScan API.
-    3) Parse domain matches and associated GO terms, pathways.

-    API Documentation: https://www.ebi.ac.uk/Tools/services/rest/iprscan5
+    API Documentation: https://www.ebi.ac.uk/interpro/api/
     """

     def __init__(
         self,
-        email: str = "graphgen@example.com",
         api_timeout: int = 30,
     ):
         """
         Initialize the InterPro Search client.

         Args:
-            email (str): Email address for EBI API requests.
             api_timeout (int): Request timeout in seconds.
         """
-        self.base_url = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5"
-        self.email = email
         self.api_timeout = api_timeout
-        self.poll_interval = 5  # Fixed interval between status checks
-        self.max_polls = 120  # Maximum polling attempts (10 minutes with 5s interval)
-
-    @staticmethod
-    def _is_protein_sequence(text: str) -> bool:
-        """Check if text looks like a protein sequence."""
-        # Remove common FASTA header prefix
-        if text.startswith(">"):
-            text = "\n".join(text.split("\n")[1:])
-        # Check if contains mostly protein amino acids
-        text = text.strip().replace("\n", "").replace(" ", "")
-        # Protein sequences contain only A-Z letters (standard amino acids)
-        return bool(re.fullmatch(r"[A-Z]+", text, re.I)) and len(text) > 10
+        self.BASE_URL = "https://www.ebi.ac.uk/interpro/api"

     @staticmethod
     def _is_uniprot_accession(text: str) -> bool:
         """Check if text looks like a UniProt accession number."""
         # UniProt: 6-10 chars starting with letter, e.g., P01308, Q96KN2
         return bool(re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", text.strip(), re.I))

-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential(multiplier=1, min=2, max=5),
-        retry=retry_if_exception_type(RequestException),
-        reraise=True,
-    )
-    def _submit_job(self, sequence: str, title: str = "") -> Optional[str]:
-        """
-        Submit a protein sequence for InterProScan analysis.
-
-        Args:
-            sequence (str): Protein sequence (FASTA or raw).
-            title (str): Optional job title.
-
-        Returns:
-            Job ID if successful, None otherwise.
-        """
-        url = f"{self.base_url}/run"
-
-        # Parse sequence if FASTA format
-        if sequence.startswith(">"):
-            sequence = (
-                "\n".join(sequence.split("\n")[1:]).replace("\n", "").replace(" ", "")
-            )
-
-        params = {
-            "email": self.email,
-            "title": title or "GraphGen_Analysis",
-            "sequence": sequence,
-            "stype": "p",
-            "appl": "NCBIfam,SMART,CDD,HAMAP",  # Multiple databases
-            "goterms": "true",
-            "pathways": "true",
-            "format": "json",
-        }
-
-        try:
-            response = requests.post(url, data=params, timeout=self.api_timeout)
-            if response.status_code == 200:
-                job_id = response.text.strip()
-                logger.debug("InterProScan job submitted: %s", job_id)
-                return job_id
-            logger.error(
-                "Failed to submit InterProScan job: %d - %s",
-                response.status_code,
-                response.text,
-            )
-            return None
-        except RequestException as e:
-            logger.error("Request error while submitting job: %s", e)
-            raise
-
-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential(multiplier=1, min=2, max=5),
-        retry=retry_if_exception_type(RequestException),
-        reraise=True,
-    )
-    def _check_status(self, job_id: str) -> Optional[str]:
-        """Check the status of a submitted job."""
-        url = f"{self.base_url}/status/{job_id}"
-        try:
-            response = requests.get(url, timeout=self.api_timeout)
-            if response.status_code == 200:
-                return response.text.strip()
-            logger.warning(
-                "Failed to check job status for %s: %d",
-                job_id,
-                response.status_code,
-            )
-            return None
-        except RequestException as e:
-            logger.error("Request error while checking status: %s", e)
-            raise
-
-    @retry(
-        stop=stop_after_attempt(3),
-        wait=wait_exponential(multiplier=1, min=2, max=5),
-        retry=retry_if_exception_type(RequestException),
-        reraise=True,
-    )
-    def _get_results(self, job_id: str) -> Optional[dict]:
-        """Retrieve the analysis results for a completed job."""
-        url = f"{self.base_url}/result/{job_id}/json"
-        try:
-            response = requests.get(url, timeout=self.api_timeout)
-            if response.status_code == 200:
-                return response.json()
-            logger.warning(
-                "Failed to retrieve results for job %s: %d",
-                job_id,
-                response.status_code,
-            )
-            return None
-        except RequestException as e:
-            logger.error("Request error while retrieving results: %s", e)
-            raise
-
-    def _poll_job(self, job_id: str) -> Optional[dict]:
-        """
-        Poll a job until completion and retrieve results.
-
-        Args:
-            job_id (str): The job ID to poll.
-
-        Returns:
-            Results dictionary if successful, None otherwise.
-        """
-        for attempt in range(self.max_polls):
-            status = self._check_status(job_id)
-
-            if status == "FINISHED":
-                logger.debug(
-                    "Job %s completed after %d polls",
-                    job_id,
-                    attempt + 1,
-                )
-                return self._get_results(job_id)
-
-            if status in ["FAILED", "NOT_FOUND"]:
-                logger.warning("Job %s has status: %s", job_id, status)
-                return None
-
-            if status == "RUNNING":
-                logger.debug(
-                    "Job %s still running (attempt %d/%d)",
-                    job_id,
-                    attempt + 1,
-                    self.max_polls,
-                )
-                time.sleep(self.poll_interval)
-            else:
-                logger.debug("Job %s status: %s", job_id, status)
-                time.sleep(self.poll_interval)
-
-        logger.warning(
-            "Job %s polling timed out after %d attempts", job_id, self.max_polls
-        )
-        return None
-
-    def search_by_sequence(self, sequence: str) -> Optional[Dict]:
-        """
-        Search for protein domains in a sequence using InterProScan API.
-
-        Args:
-            sequence (str): Protein sequence in FASTA or raw format.
-
-        Returns:
-            Dictionary with domain analysis results or None if failed.
-        """
-        if not sequence or not isinstance(sequence, str):
-            logger.error("Invalid sequence provided")
-            return None
-
-        sequence = sequence.strip()
-
-        if not self._is_protein_sequence(sequence):
-            logger.error("Invalid protein sequence format")
-            return None
-
-        # Submit job
-        job_id = self._submit_job(sequence)
-        if not job_id:
-            logger.error("Failed to submit InterProScan job")
-            return None
-
-        # Poll for results
-        results = self._poll_job(job_id)
-        if not results:
-            logger.error("Failed to retrieve InterProScan results for job %s", job_id)
-            return None
-
-        return {
-            "molecule_type": "protein",
-            "database": "InterPro",
-            "job_id": job_id,
-            "content": results,
-            "url": f"https://www.ebi.ac.uk/interpro/result/{job_id}/",
-        }
-
     def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
         """
         Search InterPro database by UniProt accession number.
@@ -261,7 +62,7 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
         accession = accession.strip().upper()

         # Query InterPro REST API for UniProt entry
-        url = f"https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/uniprot/{accession}/"
+        url = f"{self.BASE_URL}/entry/interpro/protein/uniprot/{accession}/"

         response = requests.get(url, timeout=self.api_timeout)
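For orientation, a standalone sketch of the lookup this hunk performs: a single GET against the InterPro REST API, with no email, job submission, or polling. The endpoint, timeout, and the accession P01308 (from the example input above) come from the diff; the response handling below is an illustrative assumption about the payload.

# Minimal sketch of the direct InterPro lookup used by search_by_uniprot_id().
import requests

BASE_URL = "https://www.ebi.ac.uk/interpro/api"
accession = "P01308"  # example accession from the sample input above

response = requests.get(
    f"{BASE_URL}/entry/interpro/protein/uniprot/{accession}/", timeout=30
)
response.raise_for_status()
data = response.json()
# "results" and "metadata"/"accession" are the keys the diff itself reads;
# printing them here is just for illustration.
for hit in data.get("results", []):
    print(hit.get("metadata", {}).get("accession"))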

@@ -275,6 +76,14 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:

         data = response.json()

+        # Get entry details for each InterPro entry found
+        for result in data.get("results", []):
+            interpro_acc = result.get("metadata", {}).get("accession")
+            if interpro_acc:
+                entry_details = self.get_entry_details(interpro_acc)
+                if entry_details:
+                    result["entry_details"] = entry_details
+
         result = {
             "molecule_type": "protein",
             "database": "InterPro",
@@ -285,6 +94,31 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:

         return result

+    def get_entry_details(self, interpro_accession: str) -> Optional[Dict]:
+        """
+        Get detailed information for a specific InterPro entry.
+
+        Args:
+            interpro_accession (str): InterPro accession number (e.g., IPR000001).
+        Returns:
+            Dictionary with entry details or None if not found.
+        """
+        if not interpro_accession or not isinstance(interpro_accession, str):
+            return None
+
+        url = f"{self.BASE_URL}/entry/interpro/{interpro_accession}/"
+
+        response = requests.get(url, timeout=self.api_timeout)
+        if response.status_code != 200:
+            logger.warning(
+                "Failed to get InterPro entry %s: %d",
+                interpro_accession,
+                response.status_code,
+            )
+            return None
+
+        return response.json()
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=2, max=5),
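A short usage sketch for the new helper on its own. The class, constructor argument, method name, and the IPR000001 example come from the diff; the keys printed from the response are assumptions about the entry payload.

# Illustrative direct call to get_entry_details(); requires network access.
from graphgen.models.searcher.db.interpro_searcher import InterProSearch

searcher = InterProSearch(api_timeout=30)
details = searcher.get_entry_details("IPR000001")  # example accession from the docstring
if details:
    metadata = details.get("metadata", {})
    # "accession" and "type" are assumed fields of the entry payload.
    print(metadata.get("accession"), metadata.get("type"))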
@@ -293,14 +127,10 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
     )
     def search(self, query: str, **kwargs) -> Optional[Dict]:
         """
-        Search InterPro for protein domain information.
-
-        Automatically detects query type:
-        - UniProt accession number → lookup pre-computed domains
-        - Protein sequence (FASTA or raw) → submit for InterProScan analysis
+        Search InterPro for protein domain information by UniProt accession.

         Args:
-            query (str): Search query (UniProt ID or protein sequence).
+            query (str): UniProt accession number (e.g., P01308, Q96KN2).
             **kwargs: Additional arguments (unused).

         Returns:
@@ -313,22 +143,9 @@ def search(self, query: str, **kwargs) -> Optional[Dict]:
         query = query.strip()
         logger.debug("InterPro search query: %s", query[:100])

-        result = None
-
-        # Check if UniProt accession
-        if self._is_uniprot_accession(query):
-            logger.debug("Detected UniProt accession: %s", query)
-            result = self.search_by_uniprot_id(query)
-
-        # Check if protein sequence
-        elif self._is_protein_sequence(query):
-            logger.debug("Detected protein sequence (length: %d)", len(query))
-            result = self.search_by_sequence(query)
-
-        else:
-            # Try as UniProt ID first (in case format is non-standard)
-            logger.debug("Trying as UniProt accession: %s", query)
-            result = self.search_by_uniprot_id(query)
+        # Search by UniProt ID
+        logger.debug("Searching for UniProt accession: %s", query)
+        result = self.search_by_uniprot_id(query)

         if result:
             result["_search_query"] = query
