Merged
83 commits
1b5a3c5
Changing DugElement, DugConcept, adding DugVariable
Apr 15, 2025
4feeb97
Changing the HEAL parser
Apr 23, 2025
d8559cd
Changes to crawler
May 12, 2025
7650e93
Adding study data type
May 12, 2025
5d679cf
Testing parser HEAL parser
May 12, 2025
d7270c1
ENH: Changing indexing, annotator
Jun 3, 2025
66de6be
TEST: Fixing loader test
Jun 3, 2025
988aa28
Making indexing work on one file
Jun 23, 2025
d8b34b7
Correcting a few errors and cleanup
Jun 25, 2025
2319c7c
Make indices configurable through .env
Jun 25, 2025
1f1ae76
Merge branch 'DugModel2.0' into data-model-update
Jun 25, 2025
6b7fd8a
Merge DugDataModel2.0
Jun 25, 2025
72415de
Updating indices to a dict
Jun 25, 2025
3f76749
Cleaning print statements
Jun 25, 2025
4de2efd
Adjusting tests to code changes
Jun 26, 2025
dd08019
FEAT: Adding a parser for HEAL studies to get data from MDS
Jun 27, 2025
687bf0b
Changes for studies annotation and index
Jun 29, 2025
00b2a50
Added /variables and /concepts endpoints that use new indexes
vladimir2217 Jul 9, 2025
99c5f30
Adding DugSection element
Jul 10, 2025
2058131
Merge pull request #400 from helxplatform/data-model-update-api
hina-shah Jul 10, 2025
565eec8
Updating HEAL Parser to get DugSections
Jul 14, 2025
6218a68
Merge branch 'data-model-update' of github.com:helxplatform/dug into …
Jul 14, 2025
6dbf8fb
Adding Sections Index to the pipeline
Jul 14, 2025
8260af8
CLEANUP
Jul 14, 2025
5056c82
Fixed missing new model fields
vladimir2217 Jul 15, 2025
715bde9
Changing search to use index dictionary
Jul 16, 2025
811a8a8
Added new Studues API
vladimir2217 Jul 24, 2025
ca1a6e7
added import
vladimir2217 Jul 24, 2025
9510344
Added comments to new APIs
vladimir2217 Jul 25, 2025
3440ef0
Added cde endpoint. Added study_sources endpoint
vladimir2217 Aug 6, 2025
f6de20c
Added SearchQuery parameter. Changed API to handle post. Added get_va…
vladimir2217 Aug 7, 2025
8e39beb
Merge pull request #401 from helxplatform/data-model-update-api
vladimir2217 Aug 7, 2025
2a4f93c
Merge pull request #402 from helxplatform/data-model-update-studies-api
YaphetKG Aug 7, 2025
ffab250
Standardiszing response types, some minor edits to search
YaphetKG Aug 7, 2025
713bd80
Standardiszing response types, some minor edits to search
YaphetKG Aug 7, 2025
752ea78
Merge pull request #404 from helxplatform/openapi-docs
hina-shah Aug 11, 2025
6371653
ENH: Updating the schema for keyword data types
Aug 11, 2025
b511400
ENH: Changing request/response types for Sections/CDEs
Aug 12, 2025
8956abe
Merge pull request #405 from helxplatform/add-section-response
hina-shah Aug 12, 2025
5cef684
Studies API reuse variable ES query
vladimir2217 Aug 14, 2025
d4f7bf6
updated studies API search
vladimir2217 Aug 14, 2025
3ad2467
fix for cde endpoint
YaphetKG Aug 14, 2025
d8ccae1
Merge pull request #407 from helxplatform/patch-cde-api
vladimir2217 Aug 14, 2025
04ef64d
Merge pull request #406 from helxplatform/data-model-update-es-query
YaphetKG Aug 14, 2025
62e9359
Adding HEAL DDM2 parser, and its test
Aug 26, 2025
50a97ef
Merge branch 'data-model-update' of github.com:helxplatform/dug into …
Aug 26, 2025
f0d3ee1
Adding HEAL DDM2 parser, and its test
Aug 26, 2025
e7c7090
First stab at a JSON Schema export for the Dug Data Model.
gaurav Aug 28, 2025
8754448
Fixed overall JSON schema.
gaurav Aug 28, 2025
a402da2
Documented that you need a PYTHONPATH=src to run this.
gaurav Aug 28, 2025
23bde0b
response request model revamps
YaphetKG Sep 2, 2025
97aaf78
Updating Dug Data Model for efficient import
Sep 2, 2025
330b2f8
Merge pull request #409 from helxplatform/data-model-update-9-2
hina-shah Sep 2, 2025
d6efdac
pushing search edits
YaphetKG Sep 2, 2025
fcf7a1d
add identfiers
YaphetKG Sep 2, 2025
b8f200d
Merge pull request #410 from helxplatform/data-model-update-search-re…
hina-shah Sep 3, 2025
6b6073a
Merge pull request #408 from helxplatform/dug-data-model-json-schema
hina-shah Sep 3, 2025
5ccaa75
ENH: Change input element type to program name
Sep 4, 2025
e557769
Merge pull request #411 from helxplatform/add-program-name
YaphetKG Sep 4, 2025
d631dc6
revert identifier changes
Sep 4, 2025
ae7e840
Merging with remote
Sep 4, 2025
cefa880
add minimum should to make sure search returns relevant results
YaphetKG Sep 9, 2025
8ca118f
fixing api endpoints and some idnexing bug
YaphetKG Sep 9, 2025
53b172a
Ignoring elements that don't have an id
Sep 11, 2025
6eaafc1
Adding parents to concepts
Sep 15, 2025
3e7242d
Removing is_standardized to match search API
Sep 15, 2025
13ccba6
Merge branch 'data-model-update' into fix-search-filter
hina-shah Sep 16, 2025
4617328
add conditions for query being present or not. make sure that empty s…
YaphetKG Sep 16, 2025
0e34ea0
Documentation and cleanup
Sep 17, 2025
c716ed7
Merge pull request #412 from helxplatform/fix-search-filter
hina-shah Sep 17, 2025
9468c36
remove filter
YaphetKG Sep 24, 2025
012c839
Merge pull request #414 from helxplatform/remove-concept-filter
hina-shah Sep 24, 2025
61df653
Update Dockerfile and Makefile
Oct 27, 2025
123cf45
Put back requirements for descriptions for concepts
Oct 27, 2025
f88f17a
Updating programs to enable endpoints
Oct 27, 2025
cd5ae78
Update pydantic, and fix documentation page
Nov 6, 2025
5a60861
Adding concept URLs to the new data model
Nov 20, 2025
6c7aded
CLEANUP andn FIX description population
Dec 8, 2025
de2aab9
CLEANUP, TESTS and BUG: Correcting index name retrieval
Dec 11, 2025
dbfd8f7
Merge branch 'data-model-update' of github.com:helxplatform/dug into …
Dec 11, 2025
e4a7b24
FIX: return a string purl and not none
Dec 11, 2025
8ab011a
FIX index passing
Dec 11, 2025
60d0709
Correcting tests
Dec 11, 2025
9 changes: 4 additions & 5 deletions Dockerfile
@@ -3,21 +3,20 @@
# A container for the core semantic-search capability.
#
######################################################
FROM python:3.12-alpine3.21
FROM python:3.13-alpine3.22


# Install required packages
RUN apk update && \
apk add g++ make cargo rust

RUN apk upgrade -Ua
RUN apk add "libxml2=2.13.4-r6"

RUN pip install --upgrade pip
# Create a non-root user.
ENV USER dug
ENV HOME /home/$USER
ENV UID 1000
ENV USER=dug
ENV HOME=/home/$USER
ENV UID=1000

RUN adduser -D --home $HOME --uid $UID $USER

8 changes: 4 additions & 4 deletions Makefile
@@ -1,10 +1,10 @@
PYTHON = $(shell which python3)
VERSION_FILE = ./src/dug/_version.py
VERSION = $(shell cut -d " " -f 3 ${VERSION_FILE})
DOCKER_REPO = docker.io
DOCKER_OWNER = rti
DOCKER_REPO = containers.renci.org
DOCKER_OWNER = helxplatform
DOCKER_APP = dug
DOCKER_TAG = ${VERSION}
DOCKER_TAG = data-model-2.0-9-9
DOCKER_IMAGE = ${DOCKER_OWNER}/${DOCKER_APP}:$(DOCKER_TAG)
export PYTHONPATH = $(shell echo ${PWD})/src

@@ -48,7 +48,7 @@ coverage:
#build: Build Docker image
build:
echo "Building docker image: ${DOCKER_IMAGE}"
docker build -t ${DOCKER_IMAGE} -f Dockerfile .
docker build --platform=linux/amd64 -t ${DOCKER_IMAGE} -f Dockerfile .
echo "Successfully built: ${DOCKER_IMAGE}"

#publish: Build and push docker image
48 changes: 48 additions & 0 deletions bin/export_ddm_as_json_schema.py
@@ -0,0 +1,48 @@
#!/usr/bin/env python
#
# export_ddm_as_json_schema.py - Export Dug Data Model as JSON Schema
#
# SYNOPSIS
# PYTHONPATH=src python bin/export_ddm_as_json_schema.py
#

import click
import json
import logging

from dug.core.parsers._base import DugStudy, DugSection, DugVariable

logging.basicConfig(level=logging.INFO)

@click.command()
def export_ddm_as_json_schema():
"""

:return:
"""
logging.info("Exporting Dug Data Model as JSON Schema")

json_schema = {
'$schema': 'https://json-schema.org/draft/2020-12/schema',
# This is what Pydantic supports: https://docs.pydantic.dev/latest/api/json_schema/#pydantic.json_schema.GenerateJsonSchema
'definitions': {
'DugSection': DugSection.model_json_schema(),
'DugVariable': DugVariable.model_json_schema(),
'DugStudy': DugStudy.model_json_schema()
},
# We want to validate a list of heterogenous objects: each item in the list may be any of the Dug objects above.
'type': 'array',
'items': {
'oneOf': [
{'$ref': '#/definitions/DugSection'},
{'$ref': '#/definitions/DugVariable'},
{'$ref': '#/definitions/DugStudy'}
]
}
}

print(json.dumps(json_schema, indent=2))


if __name__ == '__main__':
export_ddm_as_json_schema()
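The `oneOf`-over-an-array shape above means every exported record must match exactly one of the three model schemas. A stdlib-only sketch of that rule follows; the definition stubs here are hypothetical stand-ins for the Pydantic-generated `model_json_schema()` output, not the real Dug models.

```python
# Hypothetical stand-ins for the Pydantic-generated definitions; the real
# script embeds DugSection/DugVariable/DugStudy.model_json_schema().
definitions = {
    "DugVariable": {"required": ["id"]},
    "DugStudy": {"required": ["study_id"]},
}

def matches(item: dict, defn: dict) -> bool:
    # Naive required-keys check standing in for full schema validation.
    return all(key in item for key in defn.get("required", []))

def one_of(item: dict) -> bool:
    # JSON Schema's oneOf: exactly one definition may match each item.
    return sum(matches(item, d) for d in definitions.values()) == 1

records = [{"id": "v1"}, {"study_id": "s1"}]
print(all(one_of(r) for r in records))  # True
```

In practice the exported schema would be fed to a real validator (e.g. the `jsonschema` package); the sketch only illustrates why an item carrying keys from two models would be rejected.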
3 changes: 2 additions & 1 deletion pytest.ini
@@ -6,4 +6,5 @@ markers =
api: mark a test as an api test
cli: mark a test as a cli test
testpaths =
tests
tests
pythonpath = src
2 changes: 1 addition & 1 deletion requirements.txt
@@ -12,7 +12,7 @@ MarkupSafe
ormar
mistune
pluggy
pydantic==2.9.2
pydantic==2.12.3
pyrsistent
pytest
pytest-asyncio
Empty file added src/dug/api_models/__init__.py
Empty file.
50 changes: 50 additions & 0 deletions src/dug/api_models/request_models.py
@@ -0,0 +1,50 @@
from pydantic import BaseModel, field_validator
from typing import List, Optional, Any

class GetFromIndex(BaseModel):
size: int = 0

class SearchConceptQuery(BaseModel):
query: str
offset: int = 0
size: int = 20
concept_types: list = None

class SearchVariablesQuery(BaseModel):
query: str
concept: str = ""
offset: int = 0
size: int = 1000

class FilterGrouped(BaseModel):
key: str
value: List[Any]
class SearchVariablesQueryFiltered(SearchVariablesQuery):
filter: List[FilterGrouped] = []

class SearchKgQuery(BaseModel):
query: str
unique_id: str
index: str = "kg_index"
size:int = 100

class SearchElementQuery(BaseModel):
query: str = None
parent_ids: Optional[List] = None
element_ids: Optional[List] = None
concept: Optional[str] = None
size: Optional[int] = 100
offset: Optional[int] = 0

@field_validator("parent_ids", "element_ids", mode="before")
@classmethod
def drop_empty_strings(cls, v):
if v is None:
return v
return [item for item in v if item not in ("", None)]

class VariableIds(BaseModel):
"""
List of variable IDs
"""
ids: Optional[List[str]] = []
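The `mode="before"` validator on `parent_ids`/`element_ids` runs before type coercion, so blank selections coming in from a client never reach the Elasticsearch query. Its core logic, reproduced here as a plain function outside Pydantic for illustration:

```python
# Plain-function sketch of the drop_empty_strings validator on
# SearchElementQuery; the real version runs as a Pydantic "before" validator.
def drop_empty_strings(v):
    if v is None:
        return v  # leave unset fields alone
    return [item for item in v if item not in ("", None)]

print(drop_empty_strings(["phs001", "", None, "phs002"]))  # ['phs001', 'phs002']
print(drop_empty_strings(None))  # None
```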
69 changes: 69 additions & 0 deletions src/dug/api_models/response_models.py
@@ -0,0 +1,69 @@
from dug.core.parsers._base import *
from pydantic import BaseModel, model_serializer
from typing import Optional, Any


class ElasticResultMetaData(BaseModel):
total_count: int
offset: int
size: int


class ElasticDugElementResult(BaseModel):
# Class for all entities from elastic search, we are going to have score... optionally explanation
score: float = Field(default=999)
explanation: dict = Field(default_factory=dict)
# we are going to ignore concepts...
concepts: None = Field(default=None, exclude=True)


class DugAPIResponse(BaseModel):
results: List[ElasticDugElementResult]
metadata: Optional[ElasticResultMetaData] = Field(default_factory=dict)


class ConceptResponse(ElasticDugElementResult, DugConcept):
identifiers: List[Any]
concepts: None = Field(default=None, exclude=True)


class ConceptsAPIResponse(BaseModel):
metadata: ElasticResultMetaData
results: List[ConceptResponse]
concept_types: dict = Field(default="")


class VariableResponse(ElasticDugElementResult, DugVariable):
@model_serializer
def serialize(self):
response = self.get_response_dict()
return response


class VariablesAPIResponse(DugAPIResponse):
results: List[VariableResponse]


class StudyResponse(ElasticDugElementResult, DugStudy):
@model_serializer
def serialize(self):
response = self.get_response_dict()
response.pop('abstract')
return response


class StudyAPIResponse(DugAPIResponse):
results: List[StudyResponse]


class SectionResponse(ElasticDugElementResult, DugSection):
@model_serializer
def serialize(self):
response = self.get_response_dict()
return response


class SectionAPIResponse(DugAPIResponse):
results: List[SectionResponse]
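The `@model_serializer` pattern above lets each response class control its own wire format — `StudyResponse`, for example, drops the bulky `abstract` field. A minimal sketch of the same pattern, assuming pydantic v2 (the PR pins `pydantic==2.12.3`); the fields here are illustrative, `dict(self)` stands in for the real `get_response_dict()` helper, and the real class also mixes in `ElasticDugElementResult` and `DugStudy`.

```python
from pydantic import BaseModel, model_serializer

class Study(BaseModel):
    id: str
    name: str
    abstract: str = ""

class StudyResponse(Study):
    score: float = 999  # search relevance score carried alongside domain fields

    @model_serializer
    def serialize(self):
        # Build a plain dict of fields, then trim what the API should not return.
        response = dict(self)
        response.pop("abstract", None)
        return response

resp = StudyResponse(id="s1", name="Demo", abstract="long text", score=1.5)
print(resp.model_dump())  # {'id': 's1', 'name': 'Demo', 'score': 1.5}
```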


12 changes: 5 additions & 7 deletions src/dug/cli.py
@@ -59,10 +59,9 @@ def get_argparser():
)

crawl_parser.add_argument(
'-e', '--element-type',
help='[Optional] Coerce all elements to a certain data type (e.g. DbGaP Variable).\n'
'Determines what tab elements will appear under in Dug front-end',
dest="element_type",
'-e', '--program_name',
help='[Optional] Coerce all elements to a certain program (e.g. HEAL/RADX/DBGap/etc).\n',
dest="program_name",
default=None
)

@@ -115,7 +114,7 @@ def crawl(args):
config.node_to_element_queries = {}
factory = DugFactory(config)
dug = Dug(factory)
dug.crawl(args.target, args.parser_type, args.annotator_type, args.element_type)
dug.crawl(args.target, args.parser_type, args.annotator_type, args.program_name)


def search(args):
@@ -126,7 +125,6 @@ def search(args):
response = dug.search(args.target, args.query, **args.kwargs)
# Using json.dumps raises 'TypeError: Object of type ObjectApiResponse is not JSON serializable'
#jsonResponse = json.dumps(response, indent = 2)
print(response)

def datatypes(args):
config = Config.from_env()
@@ -137,7 +135,7 @@ def datatypes(args):


def status(args):
print("Status check is not implemented yet!")
logger.warning("Status check is not implemented yet!")


def main(args=None):
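The renamed crawl option can be exercised in isolation. This standalone argparse sketch mirrors only the `-e`/`--program_name` wiring shown in the diff, not the full `crawl_parser`:

```python
import argparse

# Minimal reproduction of the renamed option: -e now coerces all parsed
# elements to one program (e.g. HEAL) instead of one element type.
parser = argparse.ArgumentParser(prog="dug crawl")
parser.add_argument(
    "-e", "--program_name",
    help="[Optional] Coerce all elements to a certain program (e.g. HEAL/RADX/DBGap/etc).",
    dest="program_name",
    default=None,
)

args = parser.parse_args(["-e", "HEAL"])
print(args.program_name)  # HEAL
```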
26 changes: 17 additions & 9 deletions src/dug/config.py
@@ -9,7 +9,7 @@
@dataclass
class Config:
"""
TODO: Populate description
TODO: Make all URLs available as enviroment variables.
"""

elastic_password: str = "changeme"
@@ -21,6 +21,7 @@ class Config:
elastic_scheme: str = "https"
elastic_ca_path: str = ""
elastic_ca_verify: bool = True
max_ids_limit = 10000

redis_host: str = "redis"
redis_port: int = 6379
@@ -30,7 +31,11 @@

studies_path: str=""


kg_index_name: str="kg_index"
concepts_index_name: str="concepts_index"
variables_index_name: str='variables_index'
studies_index_name: str='studies_index'
sections_index_name: str='sections_index'

# Preprocessor config that will be passed to annotate.Preprocessor constructor
preprocessor: dict = field(
@@ -47,8 +52,8 @@
"url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content="
},
"sapbert": {
"classification_url": "https://med-nemo.apps.renci.org/annotate/",
"annotator_url": "https://sap-qdrant.apps.renci.org/annotate/",
"classification_url": "http://med-nemo-serve-nemo-web-server.ner/annotate/",
"annotator_url": "http://qdrant-sapbert-nemo-web-server.ner/annotate/",
"score_threshold": 0.8,
"bagel": {
"enabled": False,
@@ -71,14 +76,14 @@
# Normalizer config that will be passed to annotate.Normalizer constructor
normalizer: dict = field(
default_factory=lambda: {
"url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie="
"url": "http://nn-web-node-normalization-web-service-root.translator-dev:8080/get_normalized_nodes?conflate=false&description=true&curie="
}
)

# Synonym service config that will be passed to annotate.SynonymHelper constructor
synonym_service: dict = field(
default_factory=lambda: {
"url": "https://name-resolution-sri.renci.org/reverse_lookup"
"url": "http://name-resolution-name-lookup-web-svc.translator-dev:2433/synonyms"
}
)

@@ -127,7 +132,7 @@ class Config:

concept_expander: dict = field(
default_factory=lambda: {
"url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false",
"url": "http://search-tranql:8081/tranql/tranql/query?dynamic_id_resolution=true&asynchronous=false",
"min_tranql_score": 0.0,
}
)
@@ -159,10 +164,13 @@ def from_env(cls):
"redis_port": "REDIS_PORT",
"redis_password": "REDIS_PASSWORD",
"studies_path": "STUDIES_PATH",
"kg_index_name": "ELASTIC_KG_INDEX_NAME",
"concepts_index_name": "ELASTIC_CONCEPTS_INDEX_NAME",
"variables_index_name": "ELASTIC_VARIABLES_INDEX_NAME",
"studies_index_name": "ELASTIC_STUDIES_INDEX_NAME",
"sections_index_name": "ELASTIC_SECTIONS_INDEX_NAME",
}

kwargs = {}

for kwarg, env_var in env_vars.items():
env_value = os.environ.get(env_var)
if env_value:
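The `from_env` mapping follows a simple override-when-set pattern: each dataclass field keeps its default unless the corresponding environment variable is non-empty. A condensed, self-contained sketch (only two of the new `ELASTIC_*_INDEX_NAME` fields included):

```python
import os
from dataclasses import dataclass

@dataclass
class Config:
    concepts_index_name: str = "concepts_index"
    variables_index_name: str = "variables_index"

    @classmethod
    def from_env(cls):
        env_vars = {
            "concepts_index_name": "ELASTIC_CONCEPTS_INDEX_NAME",
            "variables_index_name": "ELASTIC_VARIABLES_INDEX_NAME",
        }
        kwargs = {}
        for kwarg, env_var in env_vars.items():
            env_value = os.environ.get(env_var)
            if env_value:  # unset or empty string -> keep the dataclass default
                kwargs[kwarg] = env_value
        return cls(**kwargs)

os.environ["ELASTIC_CONCEPTS_INDEX_NAME"] = "concepts_v2"
print(Config.from_env().concepts_index_name)  # concepts_v2
```

Note the truthiness check: setting a variable to an empty string behaves the same as leaving it unset, which matches the loop in the real `Config.from_env`.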