11import re
2- import time
32from typing import Dict , Optional
43
54import requests
@@ -20,227 +19,29 @@ class InterProSearch(BaseSearcher):
2019 InterPro Search client to search protein domains and functional annotations.
2120 Supports:
2221 1) Get protein domain information by UniProt accession number.
23- 2) Search with protein sequence using EBI InterProScan API.
24- 3) Parse domain matches and associated GO terms, pathways.
2522
26- API Documentation: https://www.ebi.ac.uk/Tools/services/rest/iprscan5
23+ API Documentation: https://www.ebi.ac.uk/interpro/api/
2724 """
2825
def __init__(self, api_timeout: int = 30):
    """
    Set up the InterPro Search client.

    Args:
        api_timeout (int): Timeout, in seconds, passed to every HTTP
            request this client issues.
    """
    # Root of the InterPro REST API; endpoint URLs are built by appending
    # a path to this value.
    self.BASE_URL = "https://www.ebi.ac.uk/interpro/api"
    self.api_timeout = api_timeout
5738
@staticmethod
def _is_uniprot_accession(text: str) -> bool:
    """
    Report whether ``text`` has the general shape of a UniProt accession.

    This is a loose, case-insensitive shape check: after trimming
    surrounding whitespace the value must be one letter followed by
    5 to 9 letters or digits (6-10 characters total), e.g. P01308 or
    Q96KN2. It does not verify that the accession exists.

    Args:
        text (str): Candidate accession string.

    Returns:
        True when the trimmed text matches the pattern, False otherwise.
    """
    candidate = text.strip()
    matched = re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", candidate, re.I)
    return matched is not None
6344
64- @retry (
65- stop = stop_after_attempt (3 ),
66- wait = wait_exponential (multiplier = 1 , min = 2 , max = 5 ),
67- retry = retry_if_exception_type (RequestException ),
68- reraise = True ,
69- )
70- def _submit_job (self , sequence : str , title : str = "" ) -> Optional [str ]:
71- """
72- Submit a protein sequence for InterProScan analysis.
73-
74- Args:
75- sequence (str): Protein sequence (FASTA or raw).
76- title (str): Optional job title.
77-
78- Returns:
79- Job ID if successful, None otherwise.
80- """
81- url = f"{ self .base_url } /run"
82-
83- # Parse sequence if FASTA format
84- if sequence .startswith (">" ):
85- sequence = (
86- "\n " .join (sequence .split ("\n " )[1 :]).replace ("\n " , "" ).replace (" " , "" )
87- )
88-
89- params = {
90- "email" : self .email ,
91- "title" : title or "GraphGen_Analysis" ,
92- "sequence" : sequence ,
93- "stype" : "p" ,
94- "appl" : "NCBIfam,SMART,CDD,HAMAP" , # Multiple databases
95- "goterms" : "true" ,
96- "pathways" : "true" ,
97- "format" : "json" ,
98- }
99-
100- try :
101- response = requests .post (url , data = params , timeout = self .api_timeout )
102- if response .status_code == 200 :
103- job_id = response .text .strip ()
104- logger .debug ("InterProScan job submitted: %s" , job_id )
105- return job_id
106- logger .error (
107- "Failed to submit InterProScan job: %d - %s" ,
108- response .status_code ,
109- response .text ,
110- )
111- return None
112- except RequestException as e :
113- logger .error ("Request error while submitting job: %s" , e )
114- raise
115-
116- @retry (
117- stop = stop_after_attempt (3 ),
118- wait = wait_exponential (multiplier = 1 , min = 2 , max = 5 ),
119- retry = retry_if_exception_type (RequestException ),
120- reraise = True ,
121- )
122- def _check_status (self , job_id : str ) -> Optional [str ]:
123- """Check the status of a submitted job."""
124- url = f"{ self .base_url } /status/{ job_id } "
125- try :
126- response = requests .get (url , timeout = self .api_timeout )
127- if response .status_code == 200 :
128- return response .text .strip ()
129- logger .warning (
130- "Failed to check job status for %s: %d" ,
131- job_id ,
132- response .status_code ,
133- )
134- return None
135- except RequestException as e :
136- logger .error ("Request error while checking status: %s" , e )
137- raise
138-
139- @retry (
140- stop = stop_after_attempt (3 ),
141- wait = wait_exponential (multiplier = 1 , min = 2 , max = 5 ),
142- retry = retry_if_exception_type (RequestException ),
143- reraise = True ,
144- )
145- def _get_results (self , job_id : str ) -> Optional [dict ]:
146- """Retrieve the analysis results for a completed job."""
147- url = f"{ self .base_url } /result/{ job_id } /json"
148- try :
149- response = requests .get (url , timeout = self .api_timeout )
150- if response .status_code == 200 :
151- return response .json ()
152- logger .warning (
153- "Failed to retrieve results for job %s: %d" ,
154- job_id ,
155- response .status_code ,
156- )
157- return None
158- except RequestException as e :
159- logger .error ("Request error while retrieving results: %s" , e )
160- raise
161-
162- def _poll_job (self , job_id : str ) -> Optional [dict ]:
163- """
164- Poll a job until completion and retrieve results.
165-
166- Args:
167- job_id (str): The job ID to poll.
168-
169- Returns:
170- Results dictionary if successful, None otherwise.
171- """
172- for attempt in range (self .max_polls ):
173- status = self ._check_status (job_id )
174-
175- if status == "FINISHED" :
176- logger .debug (
177- "Job %s completed after %d polls" ,
178- job_id ,
179- attempt + 1 ,
180- )
181- return self ._get_results (job_id )
182-
183- if status in ["FAILED" , "NOT_FOUND" ]:
184- logger .warning ("Job %s has status: %s" , job_id , status )
185- return None
186-
187- if status == "RUNNING" :
188- logger .debug (
189- "Job %s still running (attempt %d/%d)" ,
190- job_id ,
191- attempt + 1 ,
192- self .max_polls ,
193- )
194- time .sleep (self .poll_interval )
195- else :
196- logger .debug ("Job %s status: %s" , job_id , status )
197- time .sleep (self .poll_interval )
198-
199- logger .warning (
200- "Job %s polling timed out after %d attempts" , job_id , self .max_polls
201- )
202- return None
203-
204- def search_by_sequence (self , sequence : str ) -> Optional [Dict ]:
205- """
206- Search for protein domains in a sequence using InterProScan API.
207-
208- Args:
209- sequence (str): Protein sequence in FASTA or raw format.
210-
211- Returns:
212- Dictionary with domain analysis results or None if failed.
213- """
214- if not sequence or not isinstance (sequence , str ):
215- logger .error ("Invalid sequence provided" )
216- return None
217-
218- sequence = sequence .strip ()
219-
220- if not self ._is_protein_sequence (sequence ):
221- logger .error ("Invalid protein sequence format" )
222- return None
223-
224- # Submit job
225- job_id = self ._submit_job (sequence )
226- if not job_id :
227- logger .error ("Failed to submit InterProScan job" )
228- return None
229-
230- # Poll for results
231- results = self ._poll_job (job_id )
232- if not results :
233- logger .error ("Failed to retrieve InterProScan results for job %s" , job_id )
234- return None
235-
236- return {
237- "molecule_type" : "protein" ,
238- "database" : "InterPro" ,
239- "job_id" : job_id ,
240- "content" : results ,
241- "url" : f"https://www.ebi.ac.uk/interpro/result/{ job_id } /" ,
242- }
243-
24445 def search_by_uniprot_id (self , accession : str ) -> Optional [Dict ]:
24546 """
24647 Search InterPro database by UniProt accession number.
@@ -261,7 +62,7 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
26162 accession = accession .strip ().upper ()
26263
26364 # Query InterPro REST API for UniProt entry
264- url = f"https://www.ebi.ac.uk/interpro/api /entry/interpro/protein/uniprot/{ accession } /"
65+ url = f"{ self . BASE_URL } /entry/interpro/protein/uniprot/{ accession } /"
26566
26667 response = requests .get (url , timeout = self .api_timeout )
26768
@@ -275,6 +76,14 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
27576
27677 data = response .json ()
27778
79+ # Get entry details for each InterPro entry found
80+ for result in data .get ("results" , []):
81+ interpro_acc = result .get ("metadata" , {}).get ("accession" )
82+ if interpro_acc :
83+ entry_details = self .get_entry_details (interpro_acc )
84+ if entry_details :
85+ result ["entry_details" ] = entry_details
86+
27887 result = {
27988 "molecule_type" : "protein" ,
28089 "database" : "InterPro" ,
@@ -285,6 +94,31 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
28594
28695 return result
28796
def get_entry_details(self, interpro_accession: str) -> Optional[Dict]:
    """
    Get detailed information for a specific InterPro entry.

    Args:
        interpro_accession (str): InterPro accession number
            (e.g., IPR000001). Surrounding whitespace is ignored.

    Returns:
        The decoded JSON body of the API response, or None when the
        accession is missing, not a string, blank after trimming, or the
        API answers with a status code other than 200.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    if not isinstance(interpro_accession, str):
        return None

    accession = interpro_accession.strip()
    if not accession:
        # A whitespace-only value would otherwise produce a malformed URL.
        return None

    # Percent-encode so the accession always stays a single path segment.
    encoded = quote(accession, safe="")
    url = f"{self.BASE_URL}/entry/interpro/{encoded}/"

    response = requests.get(url, timeout=self.api_timeout)
    if response.status_code != 200:
        logger.warning(
            "Failed to get InterPro entry %s: %d",
            accession,
            response.status_code,
        )
        return None

    return response.json()
121+
288122 @retry (
289123 stop = stop_after_attempt (3 ),
290124 wait = wait_exponential (multiplier = 1 , min = 2 , max = 5 ),
@@ -293,14 +127,10 @@ def search_by_uniprot_id(self, accession: str) -> Optional[Dict]:
293127 )
294128 def search (self , query : str , ** kwargs ) -> Optional [Dict ]:
295129 """
296- Search InterPro for protein domain information.
297-
298- Automatically detects query type:
299- - UniProt accession number → lookup pre-computed domains
300- - Protein sequence (FASTA or raw) → submit for InterProScan analysis
130+ Search InterPro for protein domain information by UniProt accession.
301131
302132 Args:
303- query (str): Search query ( UniProt ID or protein sequence ).
133+ query (str): UniProt accession number (e.g., P01308, Q96KN2 ).
304134 **kwargs: Additional arguments (unused).
305135
306136 Returns:
@@ -313,22 +143,9 @@ def search(self, query: str, **kwargs) -> Optional[Dict]:
313143 query = query .strip ()
314144 logger .debug ("InterPro search query: %s" , query [:100 ])
315145
316- result = None
317-
318- # Check if UniProt accession
319- if self ._is_uniprot_accession (query ):
320- logger .debug ("Detected UniProt accession: %s" , query )
321- result = self .search_by_uniprot_id (query )
322-
323- # Check if protein sequence
324- elif self ._is_protein_sequence (query ):
325- logger .debug ("Detected protein sequence (length: %d)" , len (query ))
326- result = self .search_by_sequence (query )
327-
328- else :
329- # Try as UniProt ID first (in case format is non-standard)
330- logger .debug ("Trying as UniProt accession: %s" , query )
331- result = self .search_by_uniprot_id (query )
146+ # Search by UniProt ID
147+ logger .debug ("Searching for UniProt accession: %s" , query )
148+ result = self .search_by_uniprot_id (query )
332149
333150 if result :
334151 result ["_search_query" ] = query
0 commit comments