From 47a5e82280133b3bf81d23ceef3b54c9edd2b845 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 12 May 2024 12:15:35 +0300 Subject: [PATCH 01/65] process uploaded file --- src/controllers/ProcessController.py | 69 ++++++++++++++++++++++++++++ src/controllers/__init__.py | 1 + src/models/__init__.py | 2 + src/models/enums/ProcessingEnum.py | 6 +++ src/models/enums/ResponseEnums.py | 2 + src/requirements.txt | 2 + src/routes/data.py | 32 ++++++++++++- src/routes/schemes/__init__.py | 0 src/routes/schemes/data.py | 8 ++++ 9 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 src/controllers/ProcessController.py create mode 100644 src/models/enums/ProcessingEnum.py create mode 100644 src/routes/schemes/__init__.py create mode 100644 src/routes/schemes/data.py diff --git a/src/controllers/ProcessController.py b/src/controllers/ProcessController.py new file mode 100644 index 00000000..310934e4 --- /dev/null +++ b/src/controllers/ProcessController.py @@ -0,0 +1,69 @@ +from .BaseController import BaseController +from .ProjectController import ProjectController +import os +from langchain_community.document_loaders import TextLoader +from langchain_community.document_loaders import PyMuPDFLoader +from langchain_text_splitters import RecursiveCharacterTextSplitter +from models import ProcessingEnum + +class ProcessController(BaseController): + + def __init__(self, project_id: str): + super().__init__() + + self.project_id = project_id + self.project_path = ProjectController().get_project_path(project_id=project_id) + + def get_file_extension(self, file_id: str): + return os.path.splitext(file_id)[-1] + + def get_file_loader(self, file_id: str): + + file_ext = self.get_file_extension(file_id=file_id) + file_path = os.path.join( + self.project_path, + file_id + ) + + if file_ext == ProcessingEnum.TXT.value: + return TextLoader(file_path, encoding="utf-8") + + if file_ext == ProcessingEnum.PDF.value: + return 
PyMuPDFLoader(file_path) + + return None + + def get_file_content(self, file_id: str): + + loader = self.get_file_loader(file_id=file_id) + return loader.load() + + def process_file_content(self, file_content: list, file_id: str, + chunk_size: int=100, overlap_size: int=20): + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, + chunk_overlap=overlap_size, + length_function=len, + ) + + file_content_texts = [ + rec.page_content + for rec in file_content + ] + + file_content_metadata = [ + rec.metadata + for rec in file_content + ] + + chunks = text_splitter.create_documents( + file_content_texts, + metadatas=file_content_metadata + ) + + return chunks + + + + diff --git a/src/controllers/__init__.py b/src/controllers/__init__.py index 9007dbd8..0ca8209e 100644 --- a/src/controllers/__init__.py +++ b/src/controllers/__init__.py @@ -1,2 +1,3 @@ from .DataController import DataController from .ProjectController import ProjectController +from .ProcessController import ProcessController diff --git a/src/models/__init__.py b/src/models/__init__.py index b30f5a49..3b41f8a2 100644 --- a/src/models/__init__.py +++ b/src/models/__init__.py @@ -1 +1,3 @@ from .enums.ResponseEnums import ResponseSignal +from .enums.ProcessingEnum import ProcessingEnum + diff --git a/src/models/enums/ProcessingEnum.py b/src/models/enums/ProcessingEnum.py new file mode 100644 index 00000000..f6f4c3a7 --- /dev/null +++ b/src/models/enums/ProcessingEnum.py @@ -0,0 +1,6 @@ +from enum import Enum + +class ProcessingEnum(Enum): + + TXT = ".txt" + PDF = ".pdf" diff --git a/src/models/enums/ResponseEnums.py b/src/models/enums/ResponseEnums.py index 684376c0..28aee71e 100644 --- a/src/models/enums/ResponseEnums.py +++ b/src/models/enums/ResponseEnums.py @@ -7,3 +7,5 @@ class ResponseSignal(Enum): FILE_SIZE_EXCEEDED = "file_size_exceeded" FILE_UPLOAD_SUCCESS = "file_upload_success" FILE_UPLOAD_FAILED = "file_upload_failed" + PROCESSING_SUCCESS = "processing_success" + 
PROCESSING_FAILED = "processing_failed" diff --git a/src/requirements.txt b/src/requirements.txt index 6a7b0e6d..3deda40d 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -4,3 +4,5 @@ python-multipart==0.0.9 python-dotenv==1.0.1 pydantic-settings==2.2.1 aiofiles==23.2.1 +langchain==0.1.20 +PyMuPDF==1.24.3 diff --git a/src/routes/data.py b/src/routes/data.py index 97ee5319..51f877f3 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -2,10 +2,11 @@ from fastapi.responses import JSONResponse import os from helpers.config import get_settings, Settings -from controllers import DataController, ProjectController +from controllers import DataController, ProjectController, ProcessController import aiofiles from models import ResponseSignal import logging +from .schemes.data import ProcessRequest logger = logging.getLogger('uvicorn.error') @@ -59,3 +60,32 @@ async def upload_data(project_id: str, file: UploadFile, "file_id": file_id } ) + +@data_router.post("/process/{project_id}") +async def process_endpoint(project_id: str, process_request: ProcessRequest): + + file_id = process_request.file_id + chunk_size = process_request.chunk_size + overlap_size = process_request.overlap_size + + process_controller = ProcessController(project_id=project_id) + + file_content = process_controller.get_file_content(file_id=file_id) + + file_chunks = process_controller.process_file_content( + file_content=file_content, + file_id=file_id, + chunk_size=chunk_size, + overlap_size=overlap_size + ) + + if file_chunks is None or len(file_chunks) == 0: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content={ + "signal": ResponseSignal.PROCESSING_FAILED.value + } + ) + + return file_chunks + diff --git a/src/routes/schemes/__init__.py b/src/routes/schemes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/routes/schemes/data.py b/src/routes/schemes/data.py new file mode 100644 index 00000000..a2fee336 --- /dev/null +++ 
b/src/routes/schemes/data.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel +from typing import Optional + +class ProcessRequest(BaseModel): + file_id: str + chunk_size: Optional[int] = 100 + overlap_size: Optional[int] = 20 + do_reset: Optional[int] = 0 From 8fff7f66a0c2af07d4ec1a3a002d36f5c394c8bc Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 12 May 2024 13:25:28 +0300 Subject: [PATCH 02/65] setup docker compose --- README.md | 6 ++++++ docker/.gitignore | 1 + docker/docker-compose.yml | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+) create mode 100644 docker/.gitignore create mode 100644 docker/docker-compose.yml diff --git a/README.md b/README.md index c149d47e..8596aa2f 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,12 @@ $ cp .env.example .env Set your environment variables in the `.env` file. Like `OPENAI_API_KEY` value. +## Run Docker Compose Services + +```bash +$ sudo docker compose up -d +``` + ## Run the FastAPI server ```bash diff --git a/docker/.gitignore b/docker/.gitignore new file mode 100644 index 00000000..97ee1da5 --- /dev/null +++ b/docker/.gitignore @@ -0,0 +1 @@ +mongodb diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 00000000..20372407 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,19 @@ +services: + mongodb: + image: mongo:7-jammy + + container_name: mongodb + + ports: + - "27007:27017" + + volumes: + - ./mongodb:/data/db + + networks: + - backend + + restart: always + +networks: + backend: From b29742a95bde2a49a5f4887655ecbb00ae59b906 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 12 May 2024 13:25:47 +0300 Subject: [PATCH 03/65] setup database schemes --- src/.env.example | 3 +++ src/helpers/config.py | 3 +++ src/main.py | 15 +++++++++++++++ src/models/db_schemes/__init__.py | 2 ++ src/models/db_schemes/data_chunk.py | 14 ++++++++++++++ 
src/models/db_schemes/project.py | 17 +++++++++++++++++ src/requirements.txt | 1 + 7 files changed, 55 insertions(+) create mode 100644 src/models/db_schemes/__init__.py create mode 100644 src/models/db_schemes/data_chunk.py create mode 100644 src/models/db_schemes/project.py diff --git a/src/.env.example b/src/.env.example index 60ec87a5..0b6a6f37 100644 --- a/src/.env.example +++ b/src/.env.example @@ -5,3 +5,6 @@ OPENAI_API_KEY="" FILE_ALLOWED_TYPES= FILE_MAX_SIZE=10 FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + = +MONGODB_URL= +MONGODB_DATABASE= diff --git a/src/helpers/config.py b/src/helpers/config.py index e93f99d5..3d8ea7fc 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -10,6 +10,9 @@ class Settings(BaseSettings): FILE_MAX_SIZE: int FILE_DEFAULT_CHUNK_SIZE: int + MONGODB_URL: str + MONGODB_DATABASE: str + class Config: env_file = ".env" diff --git a/src/main.py b/src/main.py index 79cee57d..168b3265 100644 --- a/src/main.py +++ b/src/main.py @@ -1,6 +1,21 @@ from fastapi import FastAPI from routes import base, data +from motor.motor_asyncio import AsyncIOMotorClient +from helpers.config import get_settings app = FastAPI() + +@app.on_event("startup") +async def startup_db_client(): + settings = get_settings() + app.mongo_conn = AsyncIOMotorClient(settings.MONGODB_URL) + app.db_client = app.mongo_conn[settings.MONGODB_DATABASE] + +@app.on_event("shutdown") +async def shutdown_db_client(): + app.mongo_conn.close() + + app.include_router(base.base_router) app.include_router(data.data_router) + diff --git a/src/models/db_schemes/__init__.py b/src/models/db_schemes/__init__.py new file mode 100644 index 00000000..6f4f9d88 --- /dev/null +++ b/src/models/db_schemes/__init__.py @@ -0,0 +1,2 @@ +from .project import Project +from .data_chunk import DataChunk diff --git a/src/models/db_schemes/data_chunk.py b/src/models/db_schemes/data_chunk.py new file mode 100644 index 00000000..a3f1b3ae --- /dev/null +++ b/src/models/db_schemes/data_chunk.py @@ -0,0 
+1,14 @@ +from pydantic import BaseModel, Field, validator +from typing import Optional +from bson.objectid import ObjectId + +class DataChunk(BaseModel): + _id: Optional[ObjectId] + chunk_text: str = Field(..., min_length=1) + chunk_metadata: dict + chunk_order: int = Field(..., gt=0) + chunk_project_id: ObjectId + + class Config: + arbitrary_types_allowed = True + diff --git a/src/models/db_schemes/project.py b/src/models/db_schemes/project.py new file mode 100644 index 00000000..1e7f41b8 --- /dev/null +++ b/src/models/db_schemes/project.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel, Field, validator +from typing import Optional +from bson.objectid import ObjectId + +class Project(BaseModel): + _id: Optional[ObjectId] + project_id: str = Field(..., min_length=1) + + @validator('project_id') + def validate_project_id(cls, value): + if not value.isalnum(): + raise ValueError('project_id must be alphanumeric') + + return value + + class Config: + arbitrary_types_allowed = True diff --git a/src/requirements.txt b/src/requirements.txt index 3deda40d..69114f4b 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -6,3 +6,4 @@ pydantic-settings==2.2.1 aiofiles==23.2.1 langchain==0.1.20 PyMuPDF==1.24.3 +motor==3.4.0 From 83727af6becb06849a44962b9597e7835e570f54 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Tue, 28 May 2024 15:29:20 +0300 Subject: [PATCH 04/65] Push projects and data chunks into mongoDB --- src/models/BaseDataModel.py | 7 ++++ src/models/ChunkModel.py | 51 ++++++++++++++++++++++++++ src/models/ProjectModel.py | 50 ++++++++++++++++++++++++++ src/models/db_schemes/data_chunk.py | 2 +- src/models/db_schemes/project.py | 2 +- src/models/enums/DataBaseEnum.py | 7 ++++ src/requirements.txt | 1 + src/routes/data.py | 55 ++++++++++++++++++++++++++--- 8 files changed, 168 insertions(+), 7 deletions(-) create mode 100644 src/models/BaseDataModel.py create mode 100644 src/models/ChunkModel.py create 
mode 100644 src/models/ProjectModel.py create mode 100644 src/models/enums/DataBaseEnum.py diff --git a/src/models/BaseDataModel.py b/src/models/BaseDataModel.py new file mode 100644 index 00000000..ab9bace1 --- /dev/null +++ b/src/models/BaseDataModel.py @@ -0,0 +1,7 @@ +from helpers.config import get_settings, Settings + +class BaseDataModel: + + def __init__(self, db_client: object): + self.db_client = db_client + self.app_settings = get_settings() diff --git a/src/models/ChunkModel.py b/src/models/ChunkModel.py new file mode 100644 index 00000000..513ef954 --- /dev/null +++ b/src/models/ChunkModel.py @@ -0,0 +1,51 @@ +from .BaseDataModel import BaseDataModel +from .db_schemes import DataChunk +from .enums.DataBaseEnum import DataBaseEnum +from bson.objectid import ObjectId +from pymongo import InsertOne + +class ChunkModel(BaseDataModel): + + def __init__(self, db_client: object): + super().__init__(db_client=db_client) + self.collection = self.db_client[DataBaseEnum.COLLECTION_CHUNK_NAME.value] + + async def create_chunk(self, chunk: DataChunk): + result = await self.collection.insert_one(chunk.dict(by_alias=True, exclude_unset=True)) + chunk._id = result.inserted_id + return chunk + + async def get_chunk(self, chunk_id: str): + result = await self.collection.find_one({ + "_id": ObjectId(chunk_id) + }) + + if result is None: + return None + + return DataChunk(**result) + + async def insert_many_chunks(self, chunks: list, batch_size: int=100): + + for i in range(0, len(chunks), batch_size): + batch = chunks[i:i+batch_size] + + operations = [ + InsertOne(chunk.dict(by_alias=True, exclude_unset=True)) + for chunk in batch + ] + + await self.collection.bulk_write(operations) + + return len(chunks) + + async def delete_chunks_by_project_id(self, project_id: ObjectId): + result = await self.collection.delete_many({ + "chunk_project_id": project_id + }) + + return result.deleted_count + + + + diff --git a/src/models/ProjectModel.py b/src/models/ProjectModel.py new 
file mode 100644 index 00000000..75dfa7ee --- /dev/null +++ b/src/models/ProjectModel.py @@ -0,0 +1,50 @@ +from .BaseDataModel import BaseDataModel +from .db_schemes import Project +from .enums.DataBaseEnum import DataBaseEnum + +class ProjectModel(BaseDataModel): + + def __init__(self, db_client: object): + super().__init__(db_client=db_client) + self.collection = self.db_client[DataBaseEnum.COLLECTION_PROJECT_NAME.value] + + async def create_project(self, project: Project): + + result = await self.collection.insert_one(project.dict(by_alias=True, exclude_unset=True)) + project._id = result.inserted_id + + return project + + async def get_project_or_create_one(self, project_id: str): + + record = await self.collection.find_one({ + "project_id": project_id + }) + + if record is None: + # create new project + project = Project(project_id=project_id) + project = await self.create_project(project=project) + + return project + + return Project(**record) + + async def get_all_projects(self, page: int=1, page_size: int=10): + + # count total number of documents + total_documents = await self.collection.count_documents({}) + + # calculate total number of pages + total_pages = total_documents // page_size + if total_documents % page_size > 0: + total_pages += 1 + + cursor = self.collection.find().skip( (page-1) * page_size ).limit(page_size) + projects = [] + async for document in cursor: + projects.append( + Project(**document) + ) + + return projects, total_pages diff --git a/src/models/db_schemes/data_chunk.py b/src/models/db_schemes/data_chunk.py index a3f1b3ae..e7c9a7fc 100644 --- a/src/models/db_schemes/data_chunk.py +++ b/src/models/db_schemes/data_chunk.py @@ -3,7 +3,7 @@ from bson.objectid import ObjectId class DataChunk(BaseModel): - _id: Optional[ObjectId] + id: Optional[ObjectId] = Field(None, alias="_id") chunk_text: str = Field(..., min_length=1) chunk_metadata: dict chunk_order: int = Field(..., gt=0) diff --git a/src/models/db_schemes/project.py 
b/src/models/db_schemes/project.py index 1e7f41b8..af66701a 100644 --- a/src/models/db_schemes/project.py +++ b/src/models/db_schemes/project.py @@ -3,7 +3,7 @@ from bson.objectid import ObjectId class Project(BaseModel): - _id: Optional[ObjectId] + id: Optional[ObjectId] = Field(None, alias="_id") project_id: str = Field(..., min_length=1) @validator('project_id') diff --git a/src/models/enums/DataBaseEnum.py b/src/models/enums/DataBaseEnum.py new file mode 100644 index 00000000..728edfee --- /dev/null +++ b/src/models/enums/DataBaseEnum.py @@ -0,0 +1,7 @@ +from enum import Enum + +class DataBaseEnum(Enum): + + COLLECTION_PROJECT_NAME = "projects" + COLLECTION_CHUNK_NAME = "chunks" + diff --git a/src/requirements.txt b/src/requirements.txt index 69114f4b..0cca06b9 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -7,3 +7,4 @@ aiofiles==23.2.1 langchain==0.1.20 PyMuPDF==1.24.3 motor==3.4.0 +pydantic-mongo==2.3.0 diff --git a/src/routes/data.py b/src/routes/data.py index 51f877f3..deadd4b5 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, APIRouter, Depends, UploadFile, status +from fastapi import FastAPI, APIRouter, Depends, UploadFile, status, Request from fastapi.responses import JSONResponse import os from helpers.config import get_settings, Settings @@ -7,6 +7,9 @@ from models import ResponseSignal import logging from .schemes.data import ProcessRequest +from models.ProjectModel import ProjectModel +from models.ChunkModel import ChunkModel +from models.db_schemes import DataChunk logger = logging.getLogger('uvicorn.error') @@ -16,10 +19,18 @@ ) @data_router.post("/upload/{project_id}") -async def upload_data(project_id: str, file: UploadFile, +async def upload_data(request: Request, project_id: str, file: UploadFile, app_settings: Settings = Depends(get_settings)): + project_model = ProjectModel( + db_client=request.app.db_client + ) + + project = await project_model.get_project_or_create_one( + 
project_id=project_id + ) + # validate the file properties data_controller = DataController() @@ -57,16 +68,25 @@ async def upload_data(project_id: str, file: UploadFile, return JSONResponse( content={ "signal": ResponseSignal.FILE_UPLOAD_SUCCESS.value, - "file_id": file_id + "file_id": file_id, } ) @data_router.post("/process/{project_id}") -async def process_endpoint(project_id: str, process_request: ProcessRequest): +async def process_endpoint(request: Request, project_id: str, process_request: ProcessRequest): file_id = process_request.file_id chunk_size = process_request.chunk_size overlap_size = process_request.overlap_size + do_reset = process_request.do_reset + + project_model = ProjectModel( + db_client=request.app.db_client + ) + + project = await project_model.get_project_or_create_one( + project_id=project_id + ) process_controller = ProcessController(project_id=project_id) @@ -87,5 +107,30 @@ async def process_endpoint(project_id: str, process_request: ProcessRequest): } ) - return file_chunks + file_chunks_records = [ + DataChunk( + chunk_text=chunk.page_content, + chunk_metadata=chunk.metadata, + chunk_order=i+1, + chunk_project_id=project.id, + ) + for i, chunk in enumerate(file_chunks) + ] + chunk_model = ChunkModel( + db_client=request.app.db_client + ) + + if do_reset == 1: + _ = await chunk_model.delete_chunks_by_project_id( + project_id=project.id + ) + + no_records = await chunk_model.insert_many_chunks(chunks=file_chunks_records) + + return JSONResponse( + content={ + "signal": ResponseSignal.PROCESSING_SUCCESS.value, + "inserted_chunks": no_records + } + ) From ec730559edea6a4b95fc3b76004414afd2da08e5 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Wed, 19 Jun 2024 18:45:00 +0300 Subject: [PATCH 05/65] update docker configurations --- README.md | 10 ++++++++++ docker/.env.example | 2 ++ docker/.gitignore | 1 + docker/docker-compose.yml | 9 ++++++++- 4 files changed, 21 insertions(+), 1 
deletion(-) create mode 100644 docker/.env.example diff --git a/README.md b/README.md index 8596aa2f..a26fdcd5 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,16 @@ Set your environment variables in the `.env` file. Like `OPENAI_API_KEY` value. ## Run Docker Compose Services ```bash +$ cd docker +$ cp .env.example .env +``` + +- update `.env` with your credentials + + + +```bash +$ cd docker $ sudo docker compose up -d ``` diff --git a/docker/.env.example b/docker/.env.example new file mode 100644 index 00000000..0cbf7103 --- /dev/null +++ b/docker/.env.example @@ -0,0 +1,2 @@ +MONGO_INITDB_ROOT_USERNAME= +MONGO_INITDB_ROOT_PASSWORD= diff --git a/docker/.gitignore b/docker/.gitignore index 97ee1da5..26477da4 100644 --- a/docker/.gitignore +++ b/docker/.gitignore @@ -1 +1,2 @@ mongodb +.env diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 20372407..9905af07 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -8,7 +8,11 @@ services: - "27007:27017" volumes: - - ./mongodb:/data/db + - mongodata:/data/db + + environment: + - MONGO_INITDB_ROOT_USERNAME=${MONGO_INITDB_ROOT_USERNAME} + - MONGO_INITDB_ROOT_PASSWORD=${MONGO_INITDB_ROOT_PASSWORD} networks: - backend @@ -17,3 +21,6 @@ services: networks: backend: + +volumes: + mongodata: From e9c4c24ead069381a4156c8fb407e65a96d57a82 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Wed, 19 Jun 2024 18:45:30 +0300 Subject: [PATCH 06/65] set indecies for projects + chunks --- src/.env.example | 4 ++-- src/models/ChunkModel.py | 18 ++++++++++++++++++ src/models/ProjectModel.py | 19 +++++++++++++++++++ src/models/db_schemes/data_chunk.py | 11 +++++++++++ src/models/db_schemes/project.py | 13 +++++++++++++ src/routes/data.py | 6 +++--- 6 files changed, 66 insertions(+), 5 deletions(-) diff --git a/src/.env.example b/src/.env.example index 0b6a6f37..08833a34 100644 --- a/src/.env.example +++ b/src/.env.example @@ -6,5 +6,5 @@ 
FILE_ALLOWED_TYPES= FILE_MAX_SIZE=10 FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB = -MONGODB_URL= -MONGODB_DATABASE= +MONGODB_URL= +MONGODB_DATABASE= diff --git a/src/models/ChunkModel.py b/src/models/ChunkModel.py index 513ef954..a4f6ca3e 100644 --- a/src/models/ChunkModel.py +++ b/src/models/ChunkModel.py @@ -10,6 +10,24 @@ def __init__(self, db_client: object): super().__init__(db_client=db_client) self.collection = self.db_client[DataBaseEnum.COLLECTION_CHUNK_NAME.value] + @classmethod + async def create_instance(cls, db_client: object): + instance = cls(db_client) + await instance.init_collection() + return instance + + async def init_collection(self): + all_collections = await self.db_client.list_collection_names() + if DataBaseEnum.COLLECTION_CHUNK_NAME.value not in all_collections: + self.collection = self.db_client[DataBaseEnum.COLLECTION_CHUNK_NAME.value] + indexes = DataChunk.get_indexes() + for index in indexes: + await self.collection.create_index( + index["key"], + name=index["name"], + unique=index["unique"] + ) + async def create_chunk(self, chunk: DataChunk): result = await self.collection.insert_one(chunk.dict(by_alias=True, exclude_unset=True)) chunk._id = result.inserted_id diff --git a/src/models/ProjectModel.py b/src/models/ProjectModel.py index 75dfa7ee..0ea1d01f 100644 --- a/src/models/ProjectModel.py +++ b/src/models/ProjectModel.py @@ -8,6 +8,25 @@ def __init__(self, db_client: object): super().__init__(db_client=db_client) self.collection = self.db_client[DataBaseEnum.COLLECTION_PROJECT_NAME.value] + @classmethod + async def create_instance(cls, db_client: object): + instance = cls(db_client) + await instance.init_collection() + return instance + + async def init_collection(self): + all_collections = await self.db_client.list_collection_names() + if DataBaseEnum.COLLECTION_PROJECT_NAME.value not in all_collections: + self.collection = self.db_client[DataBaseEnum.COLLECTION_PROJECT_NAME.value] + indexes = Project.get_indexes() + for index in 
indexes: + await self.collection.create_index( + index["key"], + name=index["name"], + unique=index["unique"] + ) + + async def create_project(self, project: Project): result = await self.collection.insert_one(project.dict(by_alias=True, exclude_unset=True)) diff --git a/src/models/db_schemes/data_chunk.py b/src/models/db_schemes/data_chunk.py index e7c9a7fc..c51f05d5 100644 --- a/src/models/db_schemes/data_chunk.py +++ b/src/models/db_schemes/data_chunk.py @@ -12,3 +12,14 @@ class DataChunk(BaseModel): class Config: arbitrary_types_allowed = True + @classmethod + def get_indexes(cls): + return [ + { + "key": [ + ("chunk_project_id", 1) + ], + "name": "chunk_project_id_index_1", + "unique": False + } + ] \ No newline at end of file diff --git a/src/models/db_schemes/project.py b/src/models/db_schemes/project.py index af66701a..3621abec 100644 --- a/src/models/db_schemes/project.py +++ b/src/models/db_schemes/project.py @@ -15,3 +15,16 @@ def validate_project_id(cls, value): class Config: arbitrary_types_allowed = True + + @classmethod + def get_indexes(cls): + + return [ + { + "key": [ + ("project_id", 1) + ], + "name": "project_id_index_1", + "unique": True + } + ] \ No newline at end of file diff --git a/src/routes/data.py b/src/routes/data.py index deadd4b5..47480c90 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -23,7 +23,7 @@ async def upload_data(request: Request, project_id: str, file: UploadFile, app_settings: Settings = Depends(get_settings)): - project_model = ProjectModel( + project_model = await ProjectModel.create_instance( db_client=request.app.db_client ) @@ -80,7 +80,7 @@ async def process_endpoint(request: Request, project_id: str, process_request: P overlap_size = process_request.overlap_size do_reset = process_request.do_reset - project_model = ProjectModel( + project_model = await ProjectModel.create_instance( db_client=request.app.db_client ) @@ -117,7 +117,7 @@ async def process_endpoint(request: Request, project_id: str, 
process_request: P for i, chunk in enumerate(file_chunks) ] - chunk_model = ChunkModel( + chunk_model = await ChunkModel.create_instance( db_client=request.app.db_client ) From 09573f0b9b4c1ab79e545c72dbcac834980c1ed6 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Thu, 20 Jun 2024 12:11:44 +0300 Subject: [PATCH 07/65] push assets collection --- src/.env.example | 10 ------- src/models/AssetModel.py | 43 +++++++++++++++++++++++++++++++ src/models/ProjectModel.py | 2 +- src/models/db_schemes/__init__.py | 1 + src/models/db_schemes/asset.py | 37 ++++++++++++++++++++++++++ src/models/enums/AssetTypeEnum.py | 6 +++++ src/models/enums/DataBaseEnum.py | 1 + src/routes/data.py | 20 ++++++++++++-- 8 files changed, 107 insertions(+), 13 deletions(-) create mode 100644 src/models/AssetModel.py create mode 100644 src/models/db_schemes/asset.py create mode 100644 src/models/enums/AssetTypeEnum.py diff --git a/src/.env.example b/src/.env.example index 08833a34..e69de29b 100644 --- a/src/.env.example +++ b/src/.env.example @@ -1,10 +0,0 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" -OPENAI_API_KEY="" - = -FILE_ALLOWED_TYPES= -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - = -MONGODB_URL= -MONGODB_DATABASE= diff --git a/src/models/AssetModel.py b/src/models/AssetModel.py new file mode 100644 index 00000000..d2d91318 --- /dev/null +++ b/src/models/AssetModel.py @@ -0,0 +1,43 @@ +from .BaseDataModel import BaseDataModel +from .db_schemes import Asset +from .enums.DataBaseEnum import DataBaseEnum +from bson import ObjectId + +class AssetModel(BaseDataModel): + + def __init__(self, db_client: object): + super().__init__(db_client=db_client) + self.collection = self.db_client[DataBaseEnum.COLLECTION_ASSET_NAME.value] + + @classmethod + async def create_instance(cls, db_client: object): + instance = cls(db_client) + await instance.init_collection() + return instance + + async def init_collection(self): + all_collections = await 
self.db_client.list_collection_names() + if DataBaseEnum.COLLECTION_ASSET_NAME.value not in all_collections: + self.collection = self.db_client[DataBaseEnum.COLLECTION_ASSET_NAME.value] + indexes = Asset.get_indexes() + for index in indexes: + await self.collection.create_index( + index["key"], + name=index["name"], + unique=index["unique"] + ) + + async def create_asset(self, asset: Asset): + + result = await self.collection.insert_one(asset.dict(by_alias=True, exclude_unset=True)) + asset.id = result.inserted_id + + return asset + + async def get_all_project_assets(self, asset_project_id: str): + + return await self.collection.find({ + "asset_project_id": ObjectId(asset_project_id) if isinstance(asset_project_id, str) else asset_project_id + }).to_list(length=None) + + diff --git a/src/models/ProjectModel.py b/src/models/ProjectModel.py index 0ea1d01f..94905d95 100644 --- a/src/models/ProjectModel.py +++ b/src/models/ProjectModel.py @@ -30,7 +30,7 @@ async def init_collection(self): async def create_project(self, project: Project): result = await self.collection.insert_one(project.dict(by_alias=True, exclude_unset=True)) - project._id = result.inserted_id + project.id = result.inserted_id return project diff --git a/src/models/db_schemes/__init__.py b/src/models/db_schemes/__init__.py index 6f4f9d88..a4d8e59b 100644 --- a/src/models/db_schemes/__init__.py +++ b/src/models/db_schemes/__init__.py @@ -1,2 +1,3 @@ from .project import Project from .data_chunk import DataChunk +from .asset import Asset diff --git a/src/models/db_schemes/asset.py b/src/models/db_schemes/asset.py new file mode 100644 index 00000000..cc3ca3d4 --- /dev/null +++ b/src/models/db_schemes/asset.py @@ -0,0 +1,37 @@ +from pydantic import BaseModel, Field, validator +from typing import Optional +from bson.objectid import ObjectId +from datetime import datetime + +class Asset(BaseModel): + id: Optional[ObjectId] = Field(None, alias="_id") + asset_project_id: ObjectId + asset_type: str = 
Field(..., min_length=1) + asset_name: str = Field(..., min_length=1) + asset_size: int = Field(ge=0, default=None) + asset_config: dict = Field(default=None) + asset_pushed_at: datetime = Field(default=datetime.utcnow) + + class Config: + arbitrary_types_allowed = True + + @classmethod + def get_indexes(cls): + + return [ + { + "key": [ + ("asset_project_id", 1) + ], + "name": "asset_project_id_index_1", + "unique": False + }, + { + "key": [ + ("asset_project_id", 1), + ("asset_name", 1) + ], + "name": "asset_project_id_name_index_1", + "unique": True + }, + ] \ No newline at end of file diff --git a/src/models/enums/AssetTypeEnum.py b/src/models/enums/AssetTypeEnum.py new file mode 100644 index 00000000..0e849ae8 --- /dev/null +++ b/src/models/enums/AssetTypeEnum.py @@ -0,0 +1,6 @@ +from enum import Enum + +class AssetTypeEnum(Enum): + + FILE = "file" + \ No newline at end of file diff --git a/src/models/enums/DataBaseEnum.py b/src/models/enums/DataBaseEnum.py index 728edfee..bb11bf71 100644 --- a/src/models/enums/DataBaseEnum.py +++ b/src/models/enums/DataBaseEnum.py @@ -4,4 +4,5 @@ class DataBaseEnum(Enum): COLLECTION_PROJECT_NAME = "projects" COLLECTION_CHUNK_NAME = "chunks" + COLLECTION_ASSET_NAME = "assets" diff --git a/src/routes/data.py b/src/routes/data.py index 47480c90..0812a951 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -9,7 +9,9 @@ from .schemes.data import ProcessRequest from models.ProjectModel import ProjectModel from models.ChunkModel import ChunkModel -from models.db_schemes import DataChunk +from models.AssetModel import AssetModel +from models.db_schemes import DataChunk, Asset +from models.enums.AssetTypeEnum import AssetTypeEnum logger = logging.getLogger('uvicorn.error') @@ -65,10 +67,24 @@ async def upload_data(request: Request, project_id: str, file: UploadFile, } ) + # store the assets into the database + asset_model = await AssetModel.create_instance( + db_client=request.app.db_client + ) + + asset_resource = Asset( + 
asset_project_id=project.id, + asset_type=AssetTypeEnum.FILE.value, + asset_name=file_id, + asset_size=os.path.getsize(file_path) + ) + + asset_record = await asset_model.create_asset(asset=asset_resource) + return JSONResponse( content={ "signal": ResponseSignal.FILE_UPLOAD_SUCCESS.value, - "file_id": file_id, + "file_id": str(asset_record.id), } ) From b5d7c29e659b5e09603021265d34911d0b390f69 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Thu, 20 Jun 2024 12:12:14 +0300 Subject: [PATCH 08/65] change .env.example --- src/.env.example | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/.env.example b/src/.env.example index e69de29b..ea4b9d2b 100644 --- a/src/.env.example +++ b/src/.env.example @@ -0,0 +1,10 @@ +APP_NAME="mini-RAG" +APP_VERSION="0.1" +OPENAI_API_KEY="" + +FILE_ALLOWED_TYPES= +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + +MONGODB_URL= +MONGODB_DATABASE= From e1b17c8dc4abd5673e98b23a810b1d9dd86c95d8 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Tue, 2 Jul 2024 18:12:19 +0300 Subject: [PATCH 09/65] update file processing endpoint --- src/controllers/ProcessController.py | 8 ++- src/models/AssetModel.py | 25 ++++++- src/models/db_schemes/data_chunk.py | 1 + src/models/enums/ResponseEnums.py | 2 + src/routes/data.py | 103 ++++++++++++++++++++------- src/routes/schemes/data.py | 2 +- 6 files changed, 112 insertions(+), 29 deletions(-) diff --git a/src/controllers/ProcessController.py b/src/controllers/ProcessController.py index 310934e4..7c2eef41 100644 --- a/src/controllers/ProcessController.py +++ b/src/controllers/ProcessController.py @@ -25,6 +25,9 @@ def get_file_loader(self, file_id: str): file_id ) + if not os.path.exists(file_path): + return None + if file_ext == ProcessingEnum.TXT.value: return TextLoader(file_path, encoding="utf-8") @@ -36,7 +39,10 @@ def get_file_loader(self, file_id: str): def 
get_file_content(self, file_id: str): loader = self.get_file_loader(file_id=file_id) - return loader.load() + if loader: + return loader.load() + + return None def process_file_content(self, file_content: list, file_id: str, chunk_size: int=100, overlap_size: int=20): diff --git a/src/models/AssetModel.py b/src/models/AssetModel.py index d2d91318..2185f321 100644 --- a/src/models/AssetModel.py +++ b/src/models/AssetModel.py @@ -34,10 +34,29 @@ async def create_asset(self, asset: Asset): return asset - async def get_all_project_assets(self, asset_project_id: str): + async def get_all_project_assets(self, asset_project_id: str, asset_type: str): - return await self.collection.find({ - "asset_project_id": ObjectId(asset_project_id) if isinstance(asset_project_id, str) else asset_project_id + records = await self.collection.find({ + "asset_project_id": ObjectId(asset_project_id) if isinstance(asset_project_id, str) else asset_project_id, + "asset_type": asset_type, }).to_list(length=None) + return [ + Asset(**record) + for record in records + ] + + async def get_asset_record(self, asset_project_id: str, asset_name: str): + + record = await self.collection.find_one({ + "asset_project_id": ObjectId(asset_project_id) if isinstance(asset_project_id, str) else asset_project_id, + "asset_name": asset_name, + }) + + if record: + return Asset(**record) + + return None + + diff --git a/src/models/db_schemes/data_chunk.py b/src/models/db_schemes/data_chunk.py index c51f05d5..91c0837d 100644 --- a/src/models/db_schemes/data_chunk.py +++ b/src/models/db_schemes/data_chunk.py @@ -8,6 +8,7 @@ class DataChunk(BaseModel): chunk_metadata: dict chunk_order: int = Field(..., gt=0) chunk_project_id: ObjectId + chunk_asset_id: ObjectId class Config: arbitrary_types_allowed = True diff --git a/src/models/enums/ResponseEnums.py b/src/models/enums/ResponseEnums.py index 28aee71e..442dbd3d 100644 --- a/src/models/enums/ResponseEnums.py +++ b/src/models/enums/ResponseEnums.py @@ -9,3 +9,5 @@ 
class ResponseSignal(Enum): FILE_UPLOAD_FAILED = "file_upload_failed" PROCESSING_SUCCESS = "processing_success" PROCESSING_FAILED = "processing_failed" + NO_FILES_ERROR = "not_found_files" + FILE_ID_ERROR = "no_file_found_with_this_id" diff --git a/src/routes/data.py b/src/routes/data.py index 0812a951..ea49a178 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -91,7 +91,6 @@ async def upload_data(request: Request, project_id: str, file: UploadFile, @data_router.post("/process/{project_id}") async def process_endpoint(request: Request, project_id: str, process_request: ProcessRequest): - file_id = process_request.file_id chunk_size = process_request.chunk_size overlap_size = process_request.overlap_size do_reset = process_request.do_reset @@ -104,49 +103,105 @@ async def process_endpoint(request: Request, project_id: str, process_request: P project_id=project_id ) - process_controller = ProcessController(project_id=project_id) + asset_model = await AssetModel.create_instance( + db_client=request.app.db_client + ) - file_content = process_controller.get_file_content(file_id=file_id) + project_files_ids = {} + if process_request.file_id: + asset_record = await asset_model.get_asset_record( + asset_project_id=project.id, + asset_name=process_request.file_id + ) - file_chunks = process_controller.process_file_content( - file_content=file_content, - file_id=file_id, - chunk_size=chunk_size, - overlap_size=overlap_size - ) + if asset_record is None: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content={ + "signal": ResponseSignal.FILE_ID_ERROR.value, + } + ) + + project_files_ids = { + asset_record.id: asset_record.asset_name + } + + else: + - if file_chunks is None or len(file_chunks) == 0: + project_files = await asset_model.get_all_project_assets( + asset_project_id=project.id, + asset_type=AssetTypeEnum.FILE.value, + ) + + project_files_ids = { + record.id: record.asset_name + for record in project_files + } + + if 
len(project_files_ids) == 0: return JSONResponse( status_code=status.HTTP_400_BAD_REQUEST, content={ - "signal": ResponseSignal.PROCESSING_FAILED.value + "signal": ResponseSignal.NO_FILES_ERROR.value, } ) + + process_controller = ProcessController(project_id=project_id) - file_chunks_records = [ - DataChunk( - chunk_text=chunk.page_content, - chunk_metadata=chunk.metadata, - chunk_order=i+1, - chunk_project_id=project.id, - ) - for i, chunk in enumerate(file_chunks) - ] + no_records = 0 + no_files = 0 chunk_model = await ChunkModel.create_instance( - db_client=request.app.db_client - ) + db_client=request.app.db_client + ) if do_reset == 1: _ = await chunk_model.delete_chunks_by_project_id( project_id=project.id ) - no_records = await chunk_model.insert_many_chunks(chunks=file_chunks_records) + for asset_id, file_id in project_files_ids.items(): + + file_content = process_controller.get_file_content(file_id=file_id) + + if file_content is None: + logger.error(f"Error while processing file: {file_id}") + continue + + file_chunks = process_controller.process_file_content( + file_content=file_content, + file_id=file_id, + chunk_size=chunk_size, + overlap_size=overlap_size + ) + + if file_chunks is None or len(file_chunks) == 0: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content={ + "signal": ResponseSignal.PROCESSING_FAILED.value + } + ) + + file_chunks_records = [ + DataChunk( + chunk_text=chunk.page_content, + chunk_metadata=chunk.metadata, + chunk_order=i+1, + chunk_project_id=project.id, + chunk_asset_id=asset_id + ) + for i, chunk in enumerate(file_chunks) + ] + + no_records += await chunk_model.insert_many_chunks(chunks=file_chunks_records) + no_files += 1 return JSONResponse( content={ "signal": ResponseSignal.PROCESSING_SUCCESS.value, - "inserted_chunks": no_records + "inserted_chunks": no_records, + "processed_files": no_files } ) diff --git a/src/routes/schemes/data.py b/src/routes/schemes/data.py index a2fee336..2d72068b 100644 --- 
a/src/routes/schemes/data.py +++ b/src/routes/schemes/data.py @@ -2,7 +2,7 @@ from typing import Optional class ProcessRequest(BaseModel): - file_id: str + file_id: str = None chunk_size: Optional[int] = 100 overlap_size: Optional[int] = 20 do_reset: Optional[int] = 0 From 7ba5a692f377f706327c737099d0ff74e68ea1d5 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 11:12:49 +0300 Subject: [PATCH 10/65] update requirements with openai --- src/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/requirements.txt b/src/requirements.txt index 0cca06b9..30599a56 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -8,3 +8,4 @@ langchain==0.1.20 PyMuPDF==1.24.3 motor==3.4.0 pydantic-mongo==2.3.0 +openai==1.35.13 From e88430330d27281958834525ea11c474d3b8e9b4 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 11:13:00 +0300 Subject: [PATCH 11/65] create the LLMInterface --- src/stores/LLMInterface.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 src/stores/LLMInterface.py diff --git a/src/stores/LLMInterface.py b/src/stores/LLMInterface.py new file mode 100644 index 00000000..a86ebde5 --- /dev/null +++ b/src/stores/LLMInterface.py @@ -0,0 +1,24 @@ +from abc import ABC, abstractmethod + +class LLMInterface(ABC): + + @abstractmethod + def set_generation_model(self, model_id: str): + pass + + @abstractmethod + def set_embedding_model(self, model_id: str, embedding_size: int): + pass + + @abstractmethod + def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: int=None, + temperature: float = None): + pass + + @abstractmethod + def embed_text(self, text: str, document_type: str = None): + pass + + @abstractmethod + def construct_prompt(self, prompt: str, role: str): + pass From 05b3ffa4aa810e8c837d3aa3d5f9a437feea78f2 Mon Sep 17 00:00:00 2001 From: Abu Bakr 
<64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 11:13:11 +0300 Subject: [PATCH 12/65] create the LLMEnums --- src/stores/LLMEnums.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 src/stores/LLMEnums.py diff --git a/src/stores/LLMEnums.py b/src/stores/LLMEnums.py new file mode 100644 index 00000000..75ff7a65 --- /dev/null +++ b/src/stores/LLMEnums.py @@ -0,0 +1,11 @@ +from enum import Enum + +class LLMEnums(Enum): + OPENAI = "OPENAI" + COHERE = "COHERE" + +class OpenAIEnums(Enum): + SYSTEM = "system" + USER = "user" + ASSISTANT = "assistant" + From b8abe8ac5e5d9186ddbcf6c21630a9fe9012818e Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 11:13:27 +0300 Subject: [PATCH 13/65] create thhe OpenAIProvider --- src/stores/llm/__init__.py | 0 src/stores/llm/providers/OpenAIProvider.py | 104 +++++++++++++++++++++ src/stores/llm/providers/__init__.py | 0 3 files changed, 104 insertions(+) create mode 100644 src/stores/llm/__init__.py create mode 100644 src/stores/llm/providers/OpenAIProvider.py create mode 100644 src/stores/llm/providers/__init__.py diff --git a/src/stores/llm/__init__.py b/src/stores/llm/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/stores/llm/providers/OpenAIProvider.py b/src/stores/llm/providers/OpenAIProvider.py new file mode 100644 index 00000000..95a0758d --- /dev/null +++ b/src/stores/llm/providers/OpenAIProvider.py @@ -0,0 +1,104 @@ +from ..LLMInterface import LLMInterface +from ..LLMEnums import OpenAIEnums +from openai import OpenAI +import logging + +class OpenAIProvider(LLMInterface): + + def __init__(self, api_key: str, api_url: str=None, + default_input_max_characters: int=1000, + default_generation_max_output_tokens: int=1000, + default_generation_temperature: float=0.1): + + self.api_key = api_key + self.api_url = api_url + + self.default_input_max_characters = default_input_max_characters + 
self.default_generation_max_output_tokens = default_generation_max_output_tokens + self.default_generation_temperature = default_generation_temperature + + self.generation_model_id = None + + self.embedding_model_id = None + self.embedding_size = None + + self.client = OpenAI( + api_key = self.api_key, + api_url = self.api_url + ) + + self.logger = logging.getLogger(__name__) + + def set_generation_model(self, model_id: str): + self.generation_model_id = model_id + + def set_embedding_model(self, model_id: str, embedding_size: int): + self.embedding_model_id = model_id + self.embedding_size = embedding_size + + def process_text(self, text: str): + return text[:self.default_input_max_characters].strip() + + def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: int=None, + temperature: float = None): + + if not self.client: + self.logger.error("OpenAI client was not set") + return None + + if not self.generation_model_id: + self.logger.error("Generation model for OpenAI was not set") + return None + + max_output_tokens = max_output_tokens if max_output_tokens else self.default_generation_max_output_tokens + temperature = temperature if temperature else self.default_generation_temperature + + chat_history.append( + self.construct_prompt(prompt=prompt, role=OpenAIEnums.USER.value) + ) + + response = self.client.chat.completions.create( + model = self.generation_model_id, + messages = chat_history, + max_tokens = max_output_tokens, + temperature = temperature + ) + + if not response or not response.choices or len(response.choices) == 0 or not response.choices[0].message: + self.logger.error("Error while generating text with OpenAI") + return None + + return response.choices[0].message["content"] + + + def embed_text(self, text: str, document_type: str = None): + + if not self.client: + self.logger.error("OpenAI client was not set") + return None + + if not self.embedding_model_id: + self.logger.error("Embedding model for OpenAI was not set") + 
return None + + response = self.client.embeddings.create( + model = self.embedding_model_id, + input = text, + ) + + if not response or not response.data or len(response.data) == 0 or not response.data[0].embedding: + self.logger.error("Error while embedding text with OpenAI") + return None + + return response.data[0].embedding + + def construct_prompt(self, prompt: str, role: str): + return { + "role": role, + "content": self.process_text(prompt) + } + + + + + diff --git a/src/stores/llm/providers/__init__.py b/src/stores/llm/providers/__init__.py new file mode 100644 index 00000000..e69de29b From 117667803791c84ca947ee2211fd4a07732adf44 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 13:03:58 +0300 Subject: [PATCH 14/65] add cohere to requirments.txt --- src/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/requirements.txt b/src/requirements.txt index 30599a56..087a781d 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -9,3 +9,4 @@ PyMuPDF==1.24.3 motor==3.4.0 pydantic-mongo==2.3.0 openai==1.35.13 +cohere==5.5.8 From 2c4200b182f80905284c2c27ffc727027cd09150 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 13:04:11 +0300 Subject: [PATCH 15/65] update .env.example --- src/.env.example | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/.env.example b/src/.env.example index ea4b9d2b..e54e274d 100644 --- a/src/.env.example +++ b/src/.env.example @@ -1,10 +1,26 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" -OPENAI_API_KEY="" - -FILE_ALLOWED_TYPES= -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - -MONGODB_URL= -MONGODB_DATABASE= +APP_NAME="mini-RAG" +APP_VERSION="0.1" +OPENAI_API_KEY="" + = +FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + = 
+MONGODB_URL="mongodb://admin:admin@localhost:27007" +MONGODB_DATABASE="mini-rag" + = +# ========================= LLM Config ========================= +GENERATION_BACKEND = "OPENAI" +EMBEDDING_BACKEND = "COHERE" + = +OPENAI_API_KEY="" +OPENAI_API_URL= +COHERE_API_KEY="" + = +GENERATION_MODEL_ID="gpt-3.5-turbo-0125" +EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" +EMBEDDING_MODEL_SIZE=384 + = +INPUT_DAFAULT_MAX_CHARACTERS=1024 +GENERATION_DAFAULT_MAX_TOKENS=200 +GENERATION_DAFAULT_TEMPERATURE=0.1 From 7f3d5941ae0eb4fb54eea16282ce640257a6538e Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 13:04:19 +0300 Subject: [PATCH 16/65] update settings --- .vscode/settings.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..99ff45e0 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.languageServer": "Pylance" +} \ No newline at end of file From ffcd64cc788ea9f71e8a91799bc12a64230409b6 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 13:04:42 +0300 Subject: [PATCH 17/65] fix files location --- src/stores/LLMEnums.py | 11 ----------- src/stores/llm/LLMEnums.py | 23 +++++++++++++++++++++++ src/stores/{ => llm}/LLMInterface.py | 0 src/stores/llm/providers/__init__.py | 2 ++ 4 files changed, 25 insertions(+), 11 deletions(-) delete mode 100644 src/stores/LLMEnums.py create mode 100644 src/stores/llm/LLMEnums.py rename src/stores/{ => llm}/LLMInterface.py (100%) diff --git a/src/stores/LLMEnums.py b/src/stores/LLMEnums.py deleted file mode 100644 index 75ff7a65..00000000 --- a/src/stores/LLMEnums.py +++ /dev/null @@ -1,11 +0,0 @@ -from enum import Enum - -class LLMEnums(Enum): - OPENAI = "OPENAI" - COHERE = "COHERE" - -class OpenAIEnums(Enum): - SYSTEM = "system" - USER = "user" - ASSISTANT = 
"assistant" - diff --git a/src/stores/llm/LLMEnums.py b/src/stores/llm/LLMEnums.py new file mode 100644 index 00000000..bd7c8cfc --- /dev/null +++ b/src/stores/llm/LLMEnums.py @@ -0,0 +1,23 @@ +from enum import Enum + +class LLMEnums(Enum): + OPENAI = "OPENAI" + COHERE = "COHERE" + +class OpenAIEnums(Enum): + SYSTEM = "system" + USER = "user" + ASSISTANT = "assistant" + +class CoHereEnums(Enum): + SYSTEM = "SYSTEM" + USER = "USER" + ASSISTANT = "CHATBOT" + + DOCUMENT = "search_document" + QUERY = "search_query" + + +class DocumentTypeEnum(Enum): + DOCUMENT = "document" + QUERY = "query" \ No newline at end of file diff --git a/src/stores/LLMInterface.py b/src/stores/llm/LLMInterface.py similarity index 100% rename from src/stores/LLMInterface.py rename to src/stores/llm/LLMInterface.py diff --git a/src/stores/llm/providers/__init__.py b/src/stores/llm/providers/__init__.py index e69de29b..3368d766 100644 --- a/src/stores/llm/providers/__init__.py +++ b/src/stores/llm/providers/__init__.py @@ -0,0 +1,2 @@ +from .CoHereProvider import CoHereProvider +from .OpenAIProvider import OpenAIProvider From 87efbbd965ce521707d075ad1ebb318b476ed86c Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 13:05:18 +0300 Subject: [PATCH 18/65] set CoHereProvider Class --- src/stores/llm/providers/CoHereProvider.py | 96 ++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 src/stores/llm/providers/CoHereProvider.py diff --git a/src/stores/llm/providers/CoHereProvider.py b/src/stores/llm/providers/CoHereProvider.py new file mode 100644 index 00000000..8e52e6e9 --- /dev/null +++ b/src/stores/llm/providers/CoHereProvider.py @@ -0,0 +1,96 @@ +from ..LLMInterface import LLMInterface +from ..LLMEnums import CoHereEnums, DocumentTypeEnum +import cohere +import logging + +class CoHereProvider(LLMInterface): + + def __init__(self, api_key: str, + default_input_max_characters: int=1000, + 
default_generation_max_output_tokens: int=1000, + default_generation_temperature: float=0.1): + + self.api_key = api_key + + self.default_input_max_characters = default_input_max_characters + self.default_generation_max_output_tokens = default_generation_max_output_tokens + self.default_generation_temperature = default_generation_temperature + + self.generation_model_id = None + + self.embedding_model_id = None + self.embedding_size = None + + self.client = cohere.Client(api_key=self.api_key) + + self.logger = logging.getLogger(__name__) + + def set_generation_model(self, model_id: str): + self.generation_model_id = model_id + + def set_embedding_model(self, model_id: str, embedding_size: int): + self.embedding_model_id = model_id + self.embedding_size = embedding_size + + def process_text(self, text: str): + return text[:self.default_input_max_characters].strip() + + def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: int=None, + temperature: float = None): + + if not self.client: + self.logger.error("CoHere client was not set") + return None + + if not self.generation_model_id: + self.logger.error("Generation model for CoHere was not set") + return None + + max_output_tokens = max_output_tokens if max_output_tokens else self.default_generation_max_output_tokens + temperature = temperature if temperature else self.default_generation_temperature + + response = self.client.chat( + model = self.generation_model_id, + chat_history = chat_history, + message = self.process_text(prompt), + temperature = temperature, + max_tokens = max_output_tokens + ) + + if not response or not response.text: + self.logger.error("Error while generating text with CoHere") + return None + + return response.text + + def embed_text(self, text: str, document_type: str = None): + if not self.client: + self.logger.error("CoHere client was not set") + return None + + if not self.embedding_model_id: + self.logger.error("Embedding model for CoHere was not set") + return 
None + + input_type = CoHereEnums.DOCUMENT + if document_type == DocumentTypeEnum.QUERY: + input_type = CoHereEnums.QUERY + + response = self.client.embed( + model = self.embedding_model_id, + texts = [self.process_text(text)], + input_type = input_type, + embedding_types=['float'], + ) + + if not response or not response.embeddings or not response.embeddings.float: + self.logger.error("Error while embedding text with CoHere") + return None + + return response.embeddings.float[0] + + def construct_prompt(self, prompt: str, role: str): + return { + "role": role, + "text": self.process_text(prompt) + } \ No newline at end of file From 237b543f53a0840b720030fa765232de82343ae7 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 13:05:33 +0300 Subject: [PATCH 19/65] setup LLMProviderFactory --- src/stores/llm/LLMProviderFactory.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 src/stores/llm/LLMProviderFactory.py diff --git a/src/stores/llm/LLMProviderFactory.py b/src/stores/llm/LLMProviderFactory.py new file mode 100644 index 00000000..2cd25392 --- /dev/null +++ b/src/stores/llm/LLMProviderFactory.py @@ -0,0 +1,27 @@ + +from .LLMEnums import LLMEnums +from .providers import OpenAIProvider, CoHereProvider + +class LLMProviderFactory: + def __init__(self, config: dict): + self.config = config + + def create(self, provider: str): + if provider == LLMEnums.OPENAI.value: + return OpenAIProvider( + api_key = self.config.OPENAI_API_KEY, + api_url = self.config.OPENAI_API_URL, + default_input_max_characters=self.config.INPUT_DAFAULT_MAX_CHARACTERS, + default_generation_max_output_tokens=self.config.GENERATION_DAFAULT_MAX_TOKENS, + default_generation_temperature=self.config.GENERATION_DAFAULT_TEMPERATURE + ) + + if provider == LLMEnums.COHERE.value: + return CoHereProvider( + api_key = self.config.COHERE_API_KEY, + 
default_input_max_characters=self.config.INPUT_DAFAULT_MAX_CHARACTERS, + default_generation_max_output_tokens=self.config.GENERATION_DAFAULT_MAX_TOKENS, + default_generation_temperature=self.config.GENERATION_DAFAULT_TEMPERATURE + ) + + return None From 97ae1c2dcac1e92299e69115b930532848cb72e8 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sun, 14 Jul 2024 13:05:43 +0300 Subject: [PATCH 20/65] update main.py --- src/helpers/config.py | 14 ++++++++++++++ src/main.py | 16 ++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/helpers/config.py b/src/helpers/config.py index 3d8ea7fc..cc23f1ac 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -13,6 +13,20 @@ class Settings(BaseSettings): MONGODB_URL: str MONGODB_DATABASE: str + GENERATION_BACKEND: str + EMBEDDING_BACKEND: str + + OPENAI_API_KEY: str = None + OPENAI_API_URL: str = None + COHERE_API_KEY: str = None + + GENERATION_MODEL_ID: str = None + EMBEDDING_MODEL_ID: str = None + EMBEDDING_MODEL_SIZE: int = None + INPUT_DAFAULT_MAX_CHARACTERS: int = None + GENERATION_DAFAULT_MAX_TOKENS: int = None + GENERATION_DAFAULT_TEMPERATURE: float = None + class Config: env_file = ".env" diff --git a/src/main.py b/src/main.py index 168b3265..e3f77ff2 100644 --- a/src/main.py +++ b/src/main.py @@ -2,19 +2,31 @@ from routes import base, data from motor.motor_asyncio import AsyncIOMotorClient from helpers.config import get_settings +from stores.llm.LLMProviderFactory import LLMProviderFactory app = FastAPI() -@app.on_event("startup") async def startup_db_client(): settings = get_settings() app.mongo_conn = AsyncIOMotorClient(settings.MONGODB_URL) app.db_client = app.mongo_conn[settings.MONGODB_DATABASE] -@app.on_event("shutdown") + llm_provider_factory = LLMProviderFactory(settings) + + # generation client + app.generation_client = llm_provider_factory.create(provider=settings.GENERATION_BACKEND) + 
app.generation_client.set_generation_model(model_id = settings.GENERATION_MODEL_ID) + + # embedding client + app.embedding_client = llm_provider_factory.create(provider=settings.EMBEDDING_BACKEND) + app.embedding_client.set_embedding_model(model_id=settings.EMBEDDING_MODEL_ID, + embedding_size=settings.EMBEDDING_MODEL_SIZE) + async def shutdown_db_client(): app.mongo_conn.close() +app.router.lifespan.on_startup.append(startup_db_client) +app.router.lifespan.on_shutdown.append(shutdown_db_client) app.include_router(base.base_router) app.include_router(data.data_router) From 6697f0dfa34ac4176c59c9fdc1bd52bbbbde46d4 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Fri, 19 Jul 2024 23:39:57 +0300 Subject: [PATCH 21/65] VectorDB - Qdrant Provider --- src/.env.example | 57 ++++---- src/assets/.gitignore | 1 + src/controllers/BaseController.py | 16 +++ src/helpers/config.py | 4 + src/requirements.txt | 1 + src/stores/vectordb/VectorDBEnums.py | 8 ++ src/stores/vectordb/VectorDBInterface.py | 51 +++++++ .../vectordb/VectorDBProviderFactory.py | 19 +++ src/stores/vectordb/__init__.py | 0 .../vectordb/providers/QdrantDBProvider.py | 132 ++++++++++++++++++ src/stores/vectordb/providers/__init__.py | 1 + 11 files changed, 264 insertions(+), 26 deletions(-) create mode 100644 src/stores/vectordb/VectorDBEnums.py create mode 100644 src/stores/vectordb/VectorDBInterface.py create mode 100644 src/stores/vectordb/VectorDBProviderFactory.py create mode 100644 src/stores/vectordb/__init__.py create mode 100644 src/stores/vectordb/providers/QdrantDBProvider.py create mode 100644 src/stores/vectordb/providers/__init__.py diff --git a/src/.env.example b/src/.env.example index e54e274d..74416636 100644 --- a/src/.env.example +++ b/src/.env.example @@ -1,26 +1,31 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" -OPENAI_API_KEY="" - = -FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - 
= -MONGODB_URL="mongodb://admin:admin@localhost:27007" -MONGODB_DATABASE="mini-rag" - = -# ========================= LLM Config ========================= -GENERATION_BACKEND = "OPENAI" -EMBEDDING_BACKEND = "COHERE" - = -OPENAI_API_KEY="" -OPENAI_API_URL= -COHERE_API_KEY="" - = -GENERATION_MODEL_ID="gpt-3.5-turbo-0125" -EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" -EMBEDDING_MODEL_SIZE=384 - = -INPUT_DAFAULT_MAX_CHARACTERS=1024 -GENERATION_DAFAULT_MAX_TOKENS=200 -GENERATION_DAFAULT_TEMPERATURE=0.1 +APP_NAME="mini-RAG" +APP_VERSION="0.1" +OPENAI_API_KEY="" + +FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + +MONGODB_URL="mongodb://admin:admin@localhost:27007" +MONGODB_DATABASE="mini-rag" + +# ========================= LLM Config ========================= +GENERATION_BACKEND="OPENAI" +EMBEDDING_BACKEND="COHERE" + +OPENAI_API_KEY="" +OPENAI_API_URL= +COHERE_API_KEY="" + +GENERATION_MODEL_ID="gpt-3.5-turbo-0125" +EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" +EMBEDDING_MODEL_SIZE=384 + +INPUT_DAFAULT_MAX_CHARACTERS=1024 +GENERATION_DAFAULT_MAX_TOKENS=200 +GENERATION_DAFAULT_TEMPERATURE=0.1 + +# ========================= Vector DB Config ========================= +VECTOR_DB_BACKEND="QDRANT" +VECTOR_DB_PATH="qdrant_db" +VECTOR_DB_DISTANCE_METHOD="cosine" diff --git a/src/assets/.gitignore b/src/assets/.gitignore index 027271b9..ac32e2b6 100644 --- a/src/assets/.gitignore +++ b/src/assets/.gitignore @@ -1 +1,2 @@ files +database diff --git a/src/controllers/BaseController.py b/src/controllers/BaseController.py index 78554674..aa3e573d 100644 --- a/src/controllers/BaseController.py +++ b/src/controllers/BaseController.py @@ -14,6 +14,22 @@ def __init__(self): self.base_dir, "assets/files" ) + + self.database_dir = os.path.join( + self.base_dir, + "assets/database" + ) def generate_random_string(self, length: int=12): return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length)) + + 
def get_database_path(self, db_name: str): + + database_path = os.path.join( + self.database_dir, db_name + ) + + if not os.path.exists(database_path): + os.makedirs(database_path) + + return database_path \ No newline at end of file diff --git a/src/helpers/config.py b/src/helpers/config.py index cc23f1ac..25d2091d 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -27,6 +27,10 @@ class Settings(BaseSettings): GENERATION_DAFAULT_MAX_TOKENS: int = None GENERATION_DAFAULT_TEMPERATURE: float = None + VECTOR_DB_BACKEND : str + VECTOR_DB_PATH : str + VECTOR_DB_DISTANCE_METHOD: str = None + class Config: env_file = ".env" diff --git a/src/requirements.txt b/src/requirements.txt index 087a781d..8ac2317d 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -10,3 +10,4 @@ motor==3.4.0 pydantic-mongo==2.3.0 openai==1.35.13 cohere==5.5.8 +qdrant-client==1.10.1 diff --git a/src/stores/vectordb/VectorDBEnums.py b/src/stores/vectordb/VectorDBEnums.py new file mode 100644 index 00000000..808a73ff --- /dev/null +++ b/src/stores/vectordb/VectorDBEnums.py @@ -0,0 +1,8 @@ +from enum import Enum + +class VectorDBEnums(Enum): + QDRANT = "QDRANT" + +class DistanceMethodEnums(Enum): + COSINE = "cosine" + DOT = "dot" diff --git a/src/stores/vectordb/VectorDBInterface.py b/src/stores/vectordb/VectorDBInterface.py new file mode 100644 index 00000000..ee44d47e --- /dev/null +++ b/src/stores/vectordb/VectorDBInterface.py @@ -0,0 +1,51 @@ +from abc import ABC, abstractmethod +from typing import List + +class VectorDBInterface(ABC): + + @abstractmethod + def connect(self): + pass + + @abstractmethod + def disconnect(self): + pass + + @abstractmethod + def is_collection_existed(self, collection_name: str) -> bool: + pass + + @abstractmethod + def list_all_collections(self) -> List: + pass + + @abstractmethod + def get_collection_info(self, collection_name: str) -> dict: + pass + + @abstractmethod + def delete_collection(self, collection_name: str): + pass + + 
@abstractmethod + def create_collection(self, collection_name: str, + embedding_size: int, + do_reset: bool = False): + pass + + @abstractmethod + def insert_one(self, collection_name: str, text: str, vector: list, + metadata: dict = None, + record_id: str = None): + pass + + @abstractmethod + def insert_many(self, collection_name: str, texts: list, + vectors: list, metadata: list = None, + record_ids: list = None, batch_size: int = 50): + pass + + @abstractmethod + def search_by_vector(self, collection_name: str, vector: list, limit: int): + pass + \ No newline at end of file diff --git a/src/stores/vectordb/VectorDBProviderFactory.py b/src/stores/vectordb/VectorDBProviderFactory.py new file mode 100644 index 00000000..df2ac486 --- /dev/null +++ b/src/stores/vectordb/VectorDBProviderFactory.py @@ -0,0 +1,19 @@ +from .providers import QdrantDBProvider +from .VectorDBEnums import VectorDBEnums +from controllers.BaseController import BaseController + +class VectorDBProviderFactory: + def __init__(self, config): + self.config = config + self.base_controller = BaseController() + + def create(self, provider: str): + if provider == VectorDBEnums.QDRANT.value: + db_path = self.base_controller.get_database_path(db_name=self.config.VECTOR_DB_PATH) + + return QdrantDBProvider( + db_path=db_path, + distance_method=self.config.VECTOR_DB_DISTANCE_METHOD, + ) + + return None diff --git a/src/stores/vectordb/__init__.py b/src/stores/vectordb/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/stores/vectordb/providers/QdrantDBProvider.py b/src/stores/vectordb/providers/QdrantDBProvider.py new file mode 100644 index 00000000..f0609787 --- /dev/null +++ b/src/stores/vectordb/providers/QdrantDBProvider.py @@ -0,0 +1,132 @@ +from qdrant_client import models, QdrantClient +from ..VectorDBInterface import VectorDBInterface +from ..VectorDBEnums import DistanceMethodEnums +import logging +from typing import List + +class QdrantDBProvider(VectorDBInterface): + + 
def __init__(self, db_path: str, distance_method: str): + + self.client = None + self.db_path = db_path + self.distance_method = None + + if distance_method == DistanceMethodEnums.COSINE.value: + self.distance_method = models.Distance.COSINE + elif distance_method == DistanceMethodEnums.DOT.value: + self.distance_method = models.Distance.DOT + + self.logger = logging.getLogger(__name__) + + def connect(self): + self.client = QdrantClient(path=self.db_path) + + def disconnect(self): + self.client = None + + def is_collection_existed(self, collection_name: str) -> bool: + return self.client.collection_exists(collection_name=collection_name) + + def list_all_collections(self) -> List: + return self.client.get_collections() + + def get_collection_info(self, collection_name: str) -> dict: + return self.client.get_collection(collection_name=collection_name) + + def delete_collection(self, collection_name: str): + if self.is_collection_existed(collection_name): + return self.client.delete_collection(collection_name=collection_name) + + def create_collection(self, collection_name: str, + embedding_size: int, + do_reset: bool = False): + if do_reset: + _ = self.delete_collection(collection_name=collection_name) + + if not self.is_collection_existed(collection_name): + _ = self.client.create_collection( + collection_name=collection_name, + vectors_config=models.VectorParams( + size=embedding_size, + distance=self.distance_method + ) + ) + + return True + + return False + + def insert_one(self, collection_name: str, text: str, vector: list, + metadata: dict = None, + record_id: str = None): + + if not self.is_collection_existed(collection_name): + self.logger.error(f"Can not insert new record to non-existed collection: {collection_name}") + return False + + try: + _ = self.client.upload_records( + collection_name=collection_name, + records=[ + models.Record( + vector=vector, + payload={ + "text": text, "metadata": metadata + } + ) + ] + ) + except Exception as e: + 
self.logger.error(f"Error while inserting batch: {e}") + return False + + return True + + def insert_many(self, collection_name: str, texts: list, + vectors: list, metadata: list = None, + record_ids: list = None, batch_size: int = 50): + + if metadata is None: + metadata = [None] * len(texts) + + if record_ids is None: + record_ids = [None] * len(texts) + + for i in range(0, len(texts), batch_size): + batch_end = i + batch_size + + batch_texts = texts[i:batch_end] + batch_vectors = vectors[i:batch_end] + batch_metadata = metadata[i:batch_end] + + batch_records = [ + models.Record( + vector=batch_vectors[x], + payload={ + "text": batch_texts[x], "metadata": batch_metadata[x] + } + ) + + for x in range(len(batch_texts)) + ] + + try: + _ = self.client.upload_records( + collection_name=collection_name, + records=batch_records, + ) + except Exception as e: + self.logger.error(f"Error while inserting batch: {e}") + return False + + return True + + def search_by_vector(self, collection_name: str, vector: list, limit: int = 5): + + return self.client.search( + collection_name=collection_name, + query_vector=vector, + limit=limit + ) + diff --git a/src/stores/vectordb/providers/__init__.py b/src/stores/vectordb/providers/__init__.py new file mode 100644 index 00000000..139cec85 --- /dev/null +++ b/src/stores/vectordb/providers/__init__.py @@ -0,0 +1 @@ +from .QdrantDBProvider import QdrantDBProvider From 1ebd1a91f32481bc7ca5968ca2a4da70087bec60 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Sat, 20 Jul 2024 01:47:53 +0300 Subject: [PATCH 22/65] nlp routes --- src/.env.example | 10 +- src/controllers/NLPController.py | 90 +++++++++++ src/controllers/__init__.py | 2 + src/main.py | 22 ++- src/models/ChunkModel.py | 13 +- src/models/enums/ResponseEnums.py | 7 + src/routes/nlp.py | 150 ++++++++++++++++++ src/routes/schemes/nlp.py | 9 ++ .../vectordb/providers/QdrantDBProvider.py | 5 +- 9 files changed, 294 insertions(+), 14 
deletions(-) create mode 100644 src/controllers/NLPController.py create mode 100644 src/routes/nlp.py create mode 100644 src/routes/schemes/nlp.py diff --git a/src/.env.example b/src/.env.example index 74416636..f7f22f67 100644 --- a/src/.env.example +++ b/src/.env.example @@ -10,8 +10,8 @@ MONGODB_URL="mongodb://admin:admin@localhost:27007" MONGODB_DATABASE="mini-rag" # ========================= LLM Config ========================= -GENERATION_BACKEND="OPENAI" -EMBEDDING_BACKEND="COHERE" +GENERATION_BACKEND = +EMBEDDING_BACKEND = OPENAI_API_KEY="" OPENAI_API_URL= @@ -26,6 +26,6 @@ GENERATION_DAFAULT_MAX_TOKENS=200 GENERATION_DAFAULT_TEMPERATURE=0.1 # ========================= Vector DB Config ========================= -VECTOR_DB_BACKEND="QDRANT" -VECTOR_DB_PATH="qdrant_db" -VECTOR_DB_DISTANCE_METHOD="cosine" +VECTOR_DB_BACKEND = +VECTOR_DB_PATH = +VECTOR_DB_DISTANCE_METHOD = diff --git a/src/controllers/NLPController.py b/src/controllers/NLPController.py new file mode 100644 index 00000000..58b859e6 --- /dev/null +++ b/src/controllers/NLPController.py @@ -0,0 +1,90 @@ +from .BaseController import BaseController +from models.db_schemes import Project, DataChunk +from stores.llm.LLMEnums import DocumentTypeEnum +from typing import List +import json + +class NLPController(BaseController): + + def __init__(self, vectordb_client, generation_client, embedding_client): + super().__init__() + + self.vectordb_client = vectordb_client + self.generation_client = generation_client + self.embedding_client = embedding_client + + def create_collection_name(self, project_id: str): + return f"collection_{project_id}".strip() + + def reset_vector_db_collection(self, project: Project): + collection_name = self.create_collection_name(project_id=project.project_id) + return self.vectordb_client.delete_collection(collection_name=collection_name) + + def get_vector_db_collection_info(self, project: Project): + collection_name = self.create_collection_name(project_id=project.project_id) 
+ collection_info = self.vectordb_client.get_collection_info(collection_name=collection_name) + + return json.loads( + json.dumps(collection_info, default=lambda x: x.__dict__) + ) + + def index_into_vector_db(self, project: Project, chunks: List[DataChunk], + chunks_ids: List[int], + do_reset: bool = False): + + # step1: get collection name + collection_name = self.create_collection_name(project_id=project.project_id) + + # step2: manage items + texts = [ c.chunk_text for c in chunks ] + metadata = [ c.chunk_metadata for c in chunks] + vectors = [ + self.embedding_client.embed_text(text=text, + document_type=DocumentTypeEnum.DOCUMENT.value) + for text in texts + ] + + # step3: create collection if not exists + _ = self.vectordb_client.create_collection( + collection_name=collection_name, + embedding_size=self.embedding_client.embedding_size, + do_reset=do_reset, + ) + + # step4: insert into vector db + _ = self.vectordb_client.insert_many( + collection_name=collection_name, + texts=texts, + metadata=metadata, + vectors=vectors, + record_ids=chunks_ids, + ) + + return True + + def search_vector_db_collection(self, project: Project, text: str, limit: int = 10): + + # step1: get collection name + collection_name = self.create_collection_name(project_id=project.project_id) + + # step2: get text embedding vector + vector = self.embedding_client.embed_text(text=text, + document_type=DocumentTypeEnum.QUERY.value) + + if not vector or len(vector) == 0: + return False + + # step3: do semantic search + results = self.vectordb_client.search_by_vector( + collection_name=collection_name, + vector=vector, + limit=limit + ) + + if not results: + return False + + return json.loads( + json.dumps(results, default=lambda x: x.__dict__) + ) + diff --git a/src/controllers/__init__.py b/src/controllers/__init__.py index 0ca8209e..8876467a 100644 --- a/src/controllers/__init__.py +++ b/src/controllers/__init__.py @@ -1,3 +1,5 @@ from .DataController import DataController from 
.ProjectController import ProjectController from .ProcessController import ProcessController +from .NLPController import NLPController + diff --git a/src/main.py b/src/main.py index e3f77ff2..da010e93 100644 --- a/src/main.py +++ b/src/main.py @@ -1,17 +1,19 @@ from fastapi import FastAPI -from routes import base, data +from routes import base, data, nlp from motor.motor_asyncio import AsyncIOMotorClient from helpers.config import get_settings from stores.llm.LLMProviderFactory import LLMProviderFactory +from stores.vectordb.VectorDBProviderFactory import VectorDBProviderFactory app = FastAPI() -async def startup_db_client(): +async def startup_span(): settings = get_settings() app.mongo_conn = AsyncIOMotorClient(settings.MONGODB_URL) app.db_client = app.mongo_conn[settings.MONGODB_DATABASE] llm_provider_factory = LLMProviderFactory(settings) + vectordb_provider_factory = VectorDBProviderFactory(settings) # generation client app.generation_client = llm_provider_factory.create(provider=settings.GENERATION_BACKEND) @@ -21,13 +23,21 @@ async def startup_db_client(): app.embedding_client = llm_provider_factory.create(provider=settings.EMBEDDING_BACKEND) app.embedding_client.set_embedding_model(model_id=settings.EMBEDDING_MODEL_ID, embedding_size=settings.EMBEDDING_MODEL_SIZE) + + # vector db client + app.vectordb_client = vectordb_provider_factory.create( + provider=settings.VECTOR_DB_BACKEND + ) + app.vectordb_client.connect() -async def shutdown_db_client(): + +async def shutdown_span(): app.mongo_conn.close() + app.vectordb_client.disconnect() -app.router.lifespan.on_startup.append(startup_db_client) -app.router.lifespan.on_shutdown.append(shutdown_db_client) +app.on_event("startup")(startup_span) +app.on_event("shutdown")(shutdown_span) app.include_router(base.base_router) app.include_router(data.data_router) - +app.include_router(nlp.nlp_router) diff --git a/src/models/ChunkModel.py b/src/models/ChunkModel.py index a4f6ca3e..37cbb23a 100644 --- 
a/src/models/ChunkModel.py +++ b/src/models/ChunkModel.py @@ -64,6 +64,15 @@ async def delete_chunks_by_project_id(self, project_id: ObjectId): return result.deleted_count - + async def get_poject_chunks(self, project_id: ObjectId, page_no: int=1, page_size: int=50): + records = await self.collection.find({ + "chunk_project_id": project_id + }).skip( + (page_no-1) * page_size + ).limit(page_size).to_list(length=None) + + return [ + DataChunk(**record) + for record in records + ] - diff --git a/src/models/enums/ResponseEnums.py b/src/models/enums/ResponseEnums.py index 442dbd3d..c50f53d3 100644 --- a/src/models/enums/ResponseEnums.py +++ b/src/models/enums/ResponseEnums.py @@ -11,3 +11,10 @@ class ResponseSignal(Enum): PROCESSING_FAILED = "processing_failed" NO_FILES_ERROR = "not_found_files" FILE_ID_ERROR = "no_file_found_with_this_id" + PROJECT_NOT_FOUND_ERROR = "project_not_found" + INSERT_INTO_VECTORDB_ERROR = "insert_into_vectordb_error" + INSERT_INTO_VECTORDB_SUCCESS = "insert_into_vectordb_success" + VECTORDB_COLLECTION_RETRIEVED = "vectordb_collection_retrieved" + VECTORDB_SEARCH_ERROR = "vectordb_search_error" + VECTORDB_SEARCH_SUCCESS = "vectordb_search_success" + \ No newline at end of file diff --git a/src/routes/nlp.py b/src/routes/nlp.py new file mode 100644 index 00000000..acca2a60 --- /dev/null +++ b/src/routes/nlp.py @@ -0,0 +1,150 @@ +from fastapi import FastAPI, APIRouter, status, Request +from fastapi.responses import JSONResponse +from routes.schemes.nlp import PushRequest, SearchRequest +from models.ProjectModel import ProjectModel +from models.ChunkModel import ChunkModel +from controllers import NLPController +from models import ResponseSignal + +import logging + +logger = logging.getLogger('uvicorn.error') + +nlp_router = APIRouter( + prefix="/api/v1/nlp", + tags=["api_v1", "nlp"], +) + +@nlp_router.post("/index/push/{project_id}") +async def index_project(request: Request, project_id: str, push_request: PushRequest): + + project_model = 
await ProjectModel.create_instance( + db_client=request.app.db_client + ) + + chunk_model = await ChunkModel.create_instance( + db_client=request.app.db_client + ) + + project = await project_model.get_project_or_create_one( + project_id=project_id + ) + + if not project: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content={ + "signal": ResponseSignal.PROJECT_NOT_FOUND_ERROR.value + } + ) + + nlp_controller = NLPController( + vectordb_client=request.app.vectordb_client, + generation_client=request.app.generation_client, + embedding_client=request.app.embedding_client, + ) + + has_records = True + page_no = 1 + inserted_items_count = 0 + idx = 0 + + while has_records: + page_chunks = await chunk_model.get_poject_chunks(project_id=project.id, page_no=page_no) + if len(page_chunks): + page_no += 1 + + if not page_chunks or len(page_chunks) == 0: + has_records = False + break + + chunks_ids = list(range(idx, idx + len(page_chunks))) + idx += len(page_chunks) + + is_inserted = nlp_controller.index_into_vector_db( + project=project, + chunks=page_chunks, + do_reset=push_request.do_reset, + chunks_ids=chunks_ids + ) + + if not is_inserted: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content={ + "signal": ResponseSignal.INSERT_INTO_VECTORDB_ERROR.value + } + ) + + inserted_items_count += len(page_chunks) + + return JSONResponse( + content={ + "signal": ResponseSignal.INSERT_INTO_VECTORDB_SUCCESS.value, + "inserted_items_count": inserted_items_count + } + ) + +@nlp_router.get("/index/info/{project_id}") +async def get_project_index_info(request: Request, project_id: str): + + project_model = await ProjectModel.create_instance( + db_client=request.app.db_client + ) + + project = await project_model.get_project_or_create_one( + project_id=project_id + ) + + nlp_controller = NLPController( + vectordb_client=request.app.vectordb_client, + generation_client=request.app.generation_client, + 
embedding_client=request.app.embedding_client, + ) + + collection_info = nlp_controller.get_vector_db_collection_info(project=project) + + return JSONResponse( + content={ + "signal": ResponseSignal.VECTORDB_COLLECTION_RETRIEVED.value, + "collection_info": collection_info + } + ) + +@nlp_router.post("/index/search/{project_id}") +async def search_index(request: Request, project_id: str, search_request: SearchRequest): + + project_model = await ProjectModel.create_instance( + db_client=request.app.db_client + ) + + project = await project_model.get_project_or_create_one( + project_id=project_id + ) + + nlp_controller = NLPController( + vectordb_client=request.app.vectordb_client, + generation_client=request.app.generation_client, + embedding_client=request.app.embedding_client, + ) + + results = nlp_controller.search_vector_db_collection( + project=project, text=search_request.text, limit=search_request.limit + ) + + if not results: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content={ + "signal": ResponseSignal.VECTORDB_SEARCH_ERROR.value + } + ) + + return JSONResponse( + content={ + "signal": ResponseSignal.VECTORDB_SEARCH_SUCCESS.value, + "results": results + } + ) + + diff --git a/src/routes/schemes/nlp.py b/src/routes/schemes/nlp.py new file mode 100644 index 00000000..57319484 --- /dev/null +++ b/src/routes/schemes/nlp.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel +from typing import Optional + +class PushRequest(BaseModel): + do_reset: Optional[int] = 0 + +class SearchRequest(BaseModel): + text: str + limit: Optional[int] = 5 diff --git a/src/stores/vectordb/providers/QdrantDBProvider.py b/src/stores/vectordb/providers/QdrantDBProvider.py index f0609787..28bf30d5 100644 --- a/src/stores/vectordb/providers/QdrantDBProvider.py +++ b/src/stores/vectordb/providers/QdrantDBProvider.py @@ -70,6 +70,7 @@ def insert_one(self, collection_name: str, text: str, vector: list, collection_name=collection_name, records=[ models.Record( + 
id=[record_id], vector=vector, payload={ "text": text, "metadata": metadata @@ -91,7 +92,7 @@ def insert_many(self, collection_name: str, texts: list, metadata = [None] * len(texts) if record_ids is None: - record_ids = [None] * len(texts) + record_ids = list(range(0, len(texts))) for i in range(0, len(texts), batch_size): batch_end = i + batch_size @@ -99,9 +100,11 @@ def insert_many(self, collection_name: str, texts: list, batch_texts = texts[i:batch_end] batch_vectors = vectors[i:batch_end] batch_metadata = metadata[i:batch_end] + batch_record_ids = record_ids[i:batch_end] batch_records = [ models.Record( + id=batch_record_ids[x], vector=batch_vectors[x], payload={ "text": batch_texts[x], "metadata": batch_metadata[x] From bc4767e224e910459d3e839194f5941af871b5a4 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Thu, 15 Aug 2024 10:52:04 +0300 Subject: [PATCH 23/65] update .env --- src/.env.example | 66 +++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/src/.env.example b/src/.env.example index f7f22f67..1d1336ac 100644 --- a/src/.env.example +++ b/src/.env.example @@ -1,31 +1,35 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" -OPENAI_API_KEY="" - -FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - -MONGODB_URL="mongodb://admin:admin@localhost:27007" -MONGODB_DATABASE="mini-rag" - -# ========================= LLM Config ========================= -GENERATION_BACKEND = -EMBEDDING_BACKEND = - -OPENAI_API_KEY="" -OPENAI_API_URL= -COHERE_API_KEY="" - -GENERATION_MODEL_ID="gpt-3.5-turbo-0125" -EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" -EMBEDDING_MODEL_SIZE=384 - -INPUT_DAFAULT_MAX_CHARACTERS=1024 -GENERATION_DAFAULT_MAX_TOKENS=200 -GENERATION_DAFAULT_TEMPERATURE=0.1 - -# ========================= Vector DB Config ========================= -VECTOR_DB_BACKEND = -VECTOR_DB_PATH = 
-VECTOR_DB_DISTANCE_METHOD = +APP_NAME="mini-RAG" +APP_VERSION="0.1" +OPENAI_API_KEY="" + = +FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + = +MONGODB_URL="mongodb://admin:admin@localhost:27007" +MONGODB_DATABASE="mini-rag" + = +# ========================= LLM Config ========================= +GENERATION_BACKEND = +EMBEDDING_BACKEND = + = +OPENAI_API_KEY="" +OPENAI_API_URL= +COHERE_API_KEY="" + = +GENERATION_MODEL_ID="gpt-3.5-turbo-0125" +EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" +EMBEDDING_MODEL_SIZE=384 + = +INPUT_DAFAULT_MAX_CHARACTERS=1024 +GENERATION_DAFAULT_MAX_TOKENS=200 +GENERATION_DAFAULT_TEMPERATURE=0.1 + = +# ========================= Vector DB Config ========================= +VECTOR_DB_BACKEND = +VECTOR_DB_PATH = +VECTOR_DB_DISTANCE_METHOD = + = +# ========================= Template Configs ========================= +PRIMARY_LANG = "en" +DEFAULT_LANG = "en" From ece7a7c17fd415b0c65ecc51d793081fa94dfcdf Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Thu, 15 Aug 2024 10:52:33 +0300 Subject: [PATCH 24/65] update LLM providers --- src/stores/llm/providers/CoHereProvider.py | 1 + src/stores/llm/providers/OpenAIProvider.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/stores/llm/providers/CoHereProvider.py b/src/stores/llm/providers/CoHereProvider.py index 8e52e6e9..80408dda 100644 --- a/src/stores/llm/providers/CoHereProvider.py +++ b/src/stores/llm/providers/CoHereProvider.py @@ -23,6 +23,7 @@ def __init__(self, api_key: str, self.client = cohere.Client(api_key=self.api_key) + self.enums = CoHereEnums self.logger = logging.getLogger(__name__) def set_generation_model(self, model_id: str): diff --git a/src/stores/llm/providers/OpenAIProvider.py b/src/stores/llm/providers/OpenAIProvider.py index 95a0758d..a9633040 100644 --- a/src/stores/llm/providers/OpenAIProvider.py +++ 
b/src/stores/llm/providers/OpenAIProvider.py @@ -24,9 +24,10 @@ def __init__(self, api_key: str, api_url: str=None, self.client = OpenAI( api_key = self.api_key, - api_url = self.api_url + base_url = self.api_url if self.api_url and len(self.api_url) else None ) + self.enums = OpenAIEnums self.logger = logging.getLogger(__name__) def set_generation_model(self, model_id: str): @@ -68,7 +69,7 @@ def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: i self.logger.error("Error while generating text with OpenAI") return None - return response.choices[0].message["content"] + return response.choices[0].message.content def embed_text(self, text: str, document_type: str = None): From 0dc576d79d7aa2daa276a76fafef3ebcc37234c2 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Thu, 15 Aug 2024 10:53:05 +0300 Subject: [PATCH 25/65] update NLPController + NLP route --- src/controllers/NLPController.py | 51 +++++++++++++++++++++++++++-- src/models/db_schemes/data_chunk.py | 6 +++- src/routes/nlp.py | 42 +++++++++++++++++++++++- 3 files changed, 94 insertions(+), 5 deletions(-) diff --git a/src/controllers/NLPController.py b/src/controllers/NLPController.py index 58b859e6..2a3b8b8b 100644 --- a/src/controllers/NLPController.py +++ b/src/controllers/NLPController.py @@ -6,12 +6,14 @@ class NLPController(BaseController): - def __init__(self, vectordb_client, generation_client, embedding_client): + def __init__(self, vectordb_client, generation_client, + embedding_client, template_parser): super().__init__() self.vectordb_client = vectordb_client self.generation_client = generation_client self.embedding_client = embedding_client + self.template_parser = template_parser def create_collection_name(self, project_id: str): return f"collection_{project_id}".strip() @@ -84,7 +86,50 @@ def search_vector_db_collection(self, project: Project, text: str, limit: int = if not results: return False - return json.loads( - 
json.dumps(results, default=lambda x: x.__dict__) + return results + + def answer_rag_question(self, project: Project, query: str, limit: int = 10): + + answer, full_prompt, chat_history = None, None, None + + # step1: retrieve related documents + retrieved_documents = self.search_vector_db_collection( + project=project, + text=query, + limit=limit, ) + if not retrieved_documents or len(retrieved_documents) == 0: + return answer, full_prompt, chat_history + + # step2: Construct LLM prompt + system_prompt = self.template_parser.get("rag", "system_prompt") + + documents_prompts = "\n".join([ + self.template_parser.get("rag", "document_prompt", { + "doc_num": idx + 1, + "chunk_text": doc.text, + }) + for idx, doc in enumerate(retrieved_documents) + ]) + + footer_prompt = self.template_parser.get("rag", "footer_prompt") + + # step3: Construct Generation Client Prompts + chat_history = [ + self.generation_client.construct_prompt( + prompt=system_prompt, + role=self.generation_client.enums.SYSTEM.value, + ) + ] + + full_prompt = "\n\n".join([ documents_prompts, footer_prompt]) + + # step4: Retrieve the Answer + answer = self.generation_client.generate_text( + prompt=full_prompt, + chat_history=chat_history + ) + + return answer, full_prompt, chat_history + diff --git a/src/models/db_schemes/data_chunk.py b/src/models/db_schemes/data_chunk.py index 91c0837d..853c7fa5 100644 --- a/src/models/db_schemes/data_chunk.py +++ b/src/models/db_schemes/data_chunk.py @@ -23,4 +23,8 @@ def get_indexes(cls): "name": "chunk_project_id_index_1", "unique": False } - ] \ No newline at end of file + ] + +class RetrievedDocument(BaseModel): + text: str + score: float diff --git a/src/routes/nlp.py b/src/routes/nlp.py index acca2a60..4540b6e3 100644 --- a/src/routes/nlp.py +++ b/src/routes/nlp.py @@ -126,6 +126,7 @@ async def search_index(request: Request, project_id: str, search_request: Search vectordb_client=request.app.vectordb_client, generation_client=request.app.generation_client, 
embedding_client=request.app.embedding_client, + template_parser=request.app.template_parser, ) results = nlp_controller.search_vector_db_collection( @@ -143,8 +144,47 @@ async def search_index(request: Request, project_id: str, search_request: Search return JSONResponse( content={ "signal": ResponseSignal.VECTORDB_SEARCH_SUCCESS.value, - "results": results + "results": [ result.dict() for result in results ] } ) +@nlp_router.post("/index/answer/{project_id}") +async def answer_rag(request: Request, project_id: str, search_request: SearchRequest): + + project_model = await ProjectModel.create_instance( + db_client=request.app.db_client + ) + + project = await project_model.get_project_or_create_one( + project_id=project_id + ) + + nlp_controller = NLPController( + vectordb_client=request.app.vectordb_client, + generation_client=request.app.generation_client, + embedding_client=request.app.embedding_client, + template_parser=request.app.template_parser, + ) + + answer, full_prompt, chat_history = nlp_controller.answer_rag_question( + project=project, + query=search_request.text, + limit=search_request.limit, + ) + if not answer: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content={ + "signal": ResponseSignal.RAG_ANSWER_ERROR.value + } + ) + + return JSONResponse( + content={ + "signal": ResponseSignal.RAG_ANSWER_SUCCESS.value, + "answer": answer, + "full_prompt": full_prompt, + "chat_history": chat_history + } + ) From 4625a8402ac402832e79c1ce7985618ab6d946e6 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Thu, 15 Aug 2024 10:54:46 +0300 Subject: [PATCH 26/65] support getting answers from LLMs --- src/helpers/config.py | 3 ++ src/main.py | 6 +++ src/models/db_schemes/__init__.py | 2 +- src/models/enums/ResponseEnums.py | 2 + src/stores/llm/templates/__init__.py | 0 src/stores/llm/templates/locales/__init__.py | 0 .../llm/templates/locales/ar/__init__.py | 0 
src/stores/llm/templates/locales/ar/rag.py | 30 +++++++++++++ .../llm/templates/locales/en/__init__.py | 0 src/stores/llm/templates/locales/en/rag.py | 30 +++++++++++++ src/stores/llm/templates/template_parser.py | 43 +++++++++++++++++++ src/stores/vectordb/VectorDBInterface.py | 3 +- .../vectordb/providers/QdrantDBProvider.py | 14 +++++- 13 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 src/stores/llm/templates/__init__.py create mode 100644 src/stores/llm/templates/locales/__init__.py create mode 100644 src/stores/llm/templates/locales/ar/__init__.py create mode 100644 src/stores/llm/templates/locales/ar/rag.py create mode 100644 src/stores/llm/templates/locales/en/__init__.py create mode 100644 src/stores/llm/templates/locales/en/rag.py create mode 100644 src/stores/llm/templates/template_parser.py diff --git a/src/helpers/config.py b/src/helpers/config.py index 25d2091d..d97bd5c0 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -31,6 +31,9 @@ class Settings(BaseSettings): VECTOR_DB_PATH : str VECTOR_DB_DISTANCE_METHOD: str = None + PRIMARY_LANG: str = "en" + DEFAULT_LANG: str = "en" + class Config: env_file = ".env" diff --git a/src/main.py b/src/main.py index da010e93..e1d5ea5f 100644 --- a/src/main.py +++ b/src/main.py @@ -4,6 +4,7 @@ from helpers.config import get_settings from stores.llm.LLMProviderFactory import LLMProviderFactory from stores.vectordb.VectorDBProviderFactory import VectorDBProviderFactory +from stores.llm.templates.template_parser import TemplateParser app = FastAPI() @@ -30,6 +31,11 @@ async def startup_span(): ) app.vectordb_client.connect() + app.template_parser = TemplateParser( + language=settings.PRIMARY_LANG, + default_language=settings.DEFAULT_LANG, + ) + async def shutdown_span(): app.mongo_conn.close() diff --git a/src/models/db_schemes/__init__.py b/src/models/db_schemes/__init__.py index a4d8e59b..1d1cbe88 100644 --- a/src/models/db_schemes/__init__.py +++ 
b/src/models/db_schemes/__init__.py @@ -1,3 +1,3 @@ from .project import Project -from .data_chunk import DataChunk +from .data_chunk import DataChunk, RetrievedDocument from .asset import Asset diff --git a/src/models/enums/ResponseEnums.py b/src/models/enums/ResponseEnums.py index c50f53d3..5869fcad 100644 --- a/src/models/enums/ResponseEnums.py +++ b/src/models/enums/ResponseEnums.py @@ -17,4 +17,6 @@ class ResponseSignal(Enum): VECTORDB_COLLECTION_RETRIEVED = "vectordb_collection_retrieved" VECTORDB_SEARCH_ERROR = "vectordb_search_error" VECTORDB_SEARCH_SUCCESS = "vectordb_search_success" + RAG_ANSWER_ERROR = "rag_answer_error" + RAG_ANSWER_SUCCESS = "rag_answer_success" \ No newline at end of file diff --git a/src/stores/llm/templates/__init__.py b/src/stores/llm/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/stores/llm/templates/locales/__init__.py b/src/stores/llm/templates/locales/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/stores/llm/templates/locales/ar/__init__.py b/src/stores/llm/templates/locales/ar/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/stores/llm/templates/locales/ar/rag.py b/src/stores/llm/templates/locales/ar/rag.py new file mode 100644 index 00000000..be9eb200 --- /dev/null +++ b/src/stores/llm/templates/locales/ar/rag.py @@ -0,0 +1,30 @@ +from string import Template + +#### RAG PROMPTS #### + +#### System #### + +system_prompt = Template("\n".join([ + "أنت مساعد لتوليد رد للمستخدم.", + "ستحصل على مجموعة من المستندات المرتبطة باستفسار المستخدم.", + "عليك توليد رد بناءً على المستندات المقدمة.", + "تجاهل المستندات التي لا تتعلق باستفسار المستخدم.", + "يمكنك الاعتذار للمستخدم إذا لم تتمكن من توليد رد.", + "عليك توليد الرد بنفس لغة استفسار المستخدم.", + "كن مؤدباً ومحترماً في التعامل مع المستخدم.", + "كن دقيقًا ومختصرًا في ردك. 
تجنب المعلومات غير الضرورية.", +])) + +#### Document #### +document_prompt = Template( + "\n".join([ + "## المستند رقم: $doc_num", + "### المحتوى: $chunk_text", + ]) +) + +#### Footer #### +footer_prompt = Template("\n".join([ + "بناءً فقط على المستندات المذكورة أعلاه، يرجى توليد إجابة للمستخدم.", + "## الإجابة:", +])) \ No newline at end of file diff --git a/src/stores/llm/templates/locales/en/__init__.py b/src/stores/llm/templates/locales/en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/stores/llm/templates/locales/en/rag.py b/src/stores/llm/templates/locales/en/rag.py new file mode 100644 index 00000000..8f8f062f --- /dev/null +++ b/src/stores/llm/templates/locales/en/rag.py @@ -0,0 +1,30 @@ +from string import Template + +#### RAG PROMPTS #### + +#### System #### + +system_prompt = Template("\n".join([ + "You are an assistant to generate a response for the user.", + "You will be provided by a set of docuemnts associated with the user's query.", + "You have to generate a response based on the documents provided.", + "Ignore the documents that are not relevant to the user's query.", + "You can applogize to the user if you are not able to generate a response.", + "You have to generate response in the same language as the user's query.", + "Be polite and respectful to the user.", + "Be precise and concise in your response. 
Avoid unnecessary information.", +])) + +#### Document #### +document_prompt = Template( + "\n".join([ + "## Document No: $doc_num", + "### Content: $chunk_text", + ]) +) + +#### Footer #### +footer_prompt = Template("\n".join([ + "Based only on the above documents, please generate an answer for the user.", + "## Answer:", +])) \ No newline at end of file diff --git a/src/stores/llm/templates/template_parser.py b/src/stores/llm/templates/template_parser.py new file mode 100644 index 00000000..0cee58ee --- /dev/null +++ b/src/stores/llm/templates/template_parser.py @@ -0,0 +1,43 @@ +import os + +class TemplateParser: + + def __init__(self, language: str=None, default_language='en'): + self.current_path = os.path.dirname(os.path.abspath(__file__)) + self.default_language = default_language + self.language = None + + self.set_language(language) + + + def set_language(self, language: str): + if not language: + self.language = self.default_language + + language_path = os.path.join(self.current_path, "locales", language) + if os.path.exists(language_path): + self.language = language + else: + self.language = self.default_language + + def get(self, group: str, key: str, vars: dict={}): + if not group or not key: + return None + + group_path = os.path.join(self.current_path, "locales", self.language, f"{group}.py" ) + targeted_language = self.language + if not os.path.exists(group_path): + group_path = os.path.join(self.current_path, "locales", self.default_language, f"{group}.py" ) + targeted_language = self.default_language + + if not os.path.exists(group_path): + return None + + # import group module + module = __import__(f"stores.llm.templates.locales.{targeted_language}.{group}", fromlist=[group]) + + if not module: + return None + + key_attribute = getattr(module, key) + return key_attribute.substitute(vars) diff --git a/src/stores/vectordb/VectorDBInterface.py b/src/stores/vectordb/VectorDBInterface.py index ee44d47e..19eb4d9d 100644 --- 
a/src/stores/vectordb/VectorDBInterface.py +++ b/src/stores/vectordb/VectorDBInterface.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from typing import List +from models.db_schemes import RetrievedDocument class VectorDBInterface(ABC): @@ -46,6 +47,6 @@ def insert_many(self, collection_name: str, texts: list, pass @abstractmethod - def search_by_vector(self, collection_name: str, vector: list, limit: int): + def search_by_vector(self, collection_name: str, vector: list, limit: int) -> List[RetrievedDocument]: pass \ No newline at end of file diff --git a/src/stores/vectordb/providers/QdrantDBProvider.py b/src/stores/vectordb/providers/QdrantDBProvider.py index 28bf30d5..7e2492a3 100644 --- a/src/stores/vectordb/providers/QdrantDBProvider.py +++ b/src/stores/vectordb/providers/QdrantDBProvider.py @@ -3,6 +3,7 @@ from ..VectorDBEnums import DistanceMethodEnums import logging from typing import List +from models.db_schemes import RetrievedDocument class QdrantDBProvider(VectorDBInterface): @@ -127,9 +128,20 @@ def insert_many(self, collection_name: str, texts: list, def search_by_vector(self, collection_name: str, vector: list, limit: int = 5): - return self.client.search( + results = self.client.search( collection_name=collection_name, query_vector=vector, limit=limit ) + if not results or len(results) == 0: + return None + + return [ + RetrievedDocument(**{ + "score": result.score, + "text": result.payload["text"], + }) + for result in results + ] + From 52c4b057b62871b3cd5dd4f5e9ba5d0fd0222372 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Mon, 23 Sep 2024 07:07:58 +0300 Subject: [PATCH 27/65] fix missed template_parser for NLPController --- src/routes/nlp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/routes/nlp.py b/src/routes/nlp.py index 4540b6e3..9d5e9a8e 100644 --- a/src/routes/nlp.py +++ b/src/routes/nlp.py @@ -42,6 +42,7 @@ async def index_project(request: Request, project_id: str, 
push_request: PushReq vectordb_client=request.app.vectordb_client, generation_client=request.app.generation_client, embedding_client=request.app.embedding_client, + template_parser=request.app.template_parser, ) has_records = True @@ -100,6 +101,7 @@ async def get_project_index_info(request: Request, project_id: str): vectordb_client=request.app.vectordb_client, generation_client=request.app.generation_client, embedding_client=request.app.embedding_client, + template_parser=request.app.template_parser, ) collection_info = nlp_controller.get_vector_db_collection_info(project=project) From 5d175b4a5385f2b440bd72cf06be05bbf7402135 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Mon, 23 Sep 2024 07:28:37 +0300 Subject: [PATCH 28/65] fix rag template issues --- src/controllers/NLPController.py | 4 +++- src/stores/llm/templates/locales/ar/rag.py | 3 +++ src/stores/llm/templates/locales/en/rag.py | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/controllers/NLPController.py b/src/controllers/NLPController.py index 2a3b8b8b..1760bc2d 100644 --- a/src/controllers/NLPController.py +++ b/src/controllers/NLPController.py @@ -113,7 +113,9 @@ def answer_rag_question(self, project: Project, query: str, limit: int = 10): for idx, doc in enumerate(retrieved_documents) ]) - footer_prompt = self.template_parser.get("rag", "footer_prompt") + footer_prompt = self.template_parser.get("rag", "footer_prompt", { + "query": query + }) # step3: Construct Generation Client Prompts chat_history = [ diff --git a/src/stores/llm/templates/locales/ar/rag.py b/src/stores/llm/templates/locales/ar/rag.py index be9eb200..80c18817 100644 --- a/src/stores/llm/templates/locales/ar/rag.py +++ b/src/stores/llm/templates/locales/ar/rag.py @@ -26,5 +26,8 @@ #### Footer #### footer_prompt = Template("\n".join([ "بناءً فقط على المستندات المذكورة أعلاه، يرجى توليد إجابة للمستخدم.", + "## السؤال:", + "$query", + "", "## الإجابة:", ])) \ No 
newline at end of file diff --git a/src/stores/llm/templates/locales/en/rag.py b/src/stores/llm/templates/locales/en/rag.py index 8f8f062f..f784e349 100644 --- a/src/stores/llm/templates/locales/en/rag.py +++ b/src/stores/llm/templates/locales/en/rag.py @@ -26,5 +26,8 @@ #### Footer #### footer_prompt = Template("\n".join([ "Based only on the above documents, please generate an answer for the user.", + "## Question:", + "$query", + "", "## Answer:", ])) \ No newline at end of file From 46e5de944824776d7a382c9c2b349d13d71ad065 Mon Sep 17 00:00:00 2001 From: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> Date: Mon, 23 Sep 2024 11:11:31 +0300 Subject: [PATCH 29/65] update README table of contents --- README.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a26fdcd5..7ea5f35d 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,18 @@ This is an educational project where all of the codes where explained (step by s | 5 | Welcome to FastAPI | [Video](https://www.youtube.com/watch?v=cpOuCdzN_Mo&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=5) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-002) | | 6 | Nested Routes + Env Values | [Video](https://www.youtube.com/watch?v=CrR2Bz2Y7Hw&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=6) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-003) | | 7 | Uploading a File | [Video](https://www.youtube.com/watch?v=5alMKCbFqWs&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=7) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-004) | - - +| 8 | File Processing | [Video](https://www.youtube.com/watch?v=gQgr2iwtSBw) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-005) | +| 9 | Docker - MongoDB - Motor | [Video](https://www.youtube.com/watch?v=2NOKWm0xJAk) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-006) | +| 10 | Mongo Schemes and Models | [Video](https://www.youtube.com/watch?v=zgcnnMJXXV8) | 
[branch](https://github.com/bakrianoo/mini-rag/tree/tut-007) | +| 11 | Mongo Indexing | [Video](https://www.youtube.com/watch?v=iO8FAmUVcjE) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-008) | +| 12 | Data Pipeline Enhancements | [Video](https://www.youtube.com/watch?v=4x1DuezZBDU) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-008) | +| 13 | Checkpoint-1 | [Video](https://www.youtube.com/watch?v=7xIsZkCisPk) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-008) | +| 14 | LLM Factory | [Video](https://www.youtube.com/watch?v=5TKRIFtIQAY) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-008) | +| 15 | Vector DB Factory | [Video](https://www.youtube.com/watch?v=JtS9UkvF_10) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-009) | +| 16 | Semantic Search | [Video](https://www.youtube.com/watch?v=V3swQKokJW8) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-010) | +| 17 | Augmented Answers | [Video](https://www.youtube.com/watch?v=1Wx8BoM5pLU) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-011) | +| 18 | Checkpoint-1 + Fix Issues | [Video](https://youtu.be/6zG4Idxldvg) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | +| 19 | Ollama Local LLM Server | [Video](https://youtu.be/-epZ1hAAtrs) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | ## Requirements @@ -41,6 +51,10 @@ $ conda activate mini-rag export PS1="\[\033[01;32m\]\u@\h:\w\n\[\033[00m\]\$ " ``` +### (Optional) Run Ollama Local LLM Server using Colab + Ngrok + +- Check the [notebook](https://colab.research.google.com/drive/1KNi3-9KtP-k-93T3wRcmRe37mRmGhL9p?usp=sharing) + [Video](https://youtu.be/-epZ1hAAtrs) + ## Installation ### Install the required packages From 973c8337f11e51a5fa85e3224137acf811da0159 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Mon, 2 Dec 2024 05:50:21 +0200 Subject: [PATCH 30/65] Push 012 to main (#6) (#17) * init project directory * update .gitignore * update README * update 
README with CLI readability command * update requirements.txt with fastapi base packages * create assets directory * create .env base file * first API route * update instructions * update main.py * move routes to separate files + load .env values * move application files into src directory * setup the uplod endpoint * Update README.md wih youtube links * Update branches links in the README * return uploaded file id * process uploaded file * setup docker compose * setup database schemes * Push projects and data chunks into mongoDB * update docker configurations * set indecies for projects + chunks * push assets collection * change .env.example * update file processing endpoint * update requirements with openai * create the LLMInterface * create the LLMEnums * create thhe OpenAIProvider * add cohere to requirments.txt * update .env.example * update settings * fix files location * set CoHereProvider Class * setup LLMProviderFactory * update main.py * VectorDB - Qdrant Provider * nlp routes * update .env * update LLM providers * update NLPController + NLP route * support getting answers from LLMs * fix missed template_parser for NLPController * fix rag template issues * update README table of contents --------- Co-authored-by: Abu Bakr <64726450+abubakr-soliman@users.noreply.github.com> From 27d895d501fad178ce062da006380946d5a5c4eb Mon Sep 17 00:00:00 2001 From: bakrianoo Date: Mon, 2 Dec 2024 11:33:52 +0200 Subject: [PATCH 31/65] move to postgres + setup alembic --- README.md | 7 ++ docker/.env.example | 1 + docker/docker-compose.yml | 20 ++- src/.env.example | 73 +++++------ src/helpers/config.py | 7 +- src/main.py | 14 ++- src/models/db_schemes/minirag/.gitignore | 1 + src/models/db_schemes/minirag/README.md | 21 ++++ src/models/db_schemes/minirag/__init__.py | 0 .../db_schemes/minirag/alembic.ini.example | 117 ++++++++++++++++++ src/models/db_schemes/minirag/alembic/README | 1 + src/models/db_schemes/minirag/alembic/env.py | 79 ++++++++++++ 
.../db_schemes/minirag/alembic/script.py.mako | 26 ++++ .../versions/fee4cd54bd38_initial_commit.py | 76 ++++++++++++ .../db_schemes/minirag/schemes/__init__.py | 4 + .../db_schemes/minirag/schemes/asset.py | 31 +++++ .../db_schemes/minirag/schemes/datachunk.py | 36 ++++++ .../minirag/schemes/minirag_base.py | 2 + .../db_schemes/minirag/schemes/project.py | 14 +++ src/requirements.txt | 4 + 20 files changed, 487 insertions(+), 47 deletions(-) create mode 100644 src/models/db_schemes/minirag/.gitignore create mode 100644 src/models/db_schemes/minirag/README.md create mode 100644 src/models/db_schemes/minirag/__init__.py create mode 100644 src/models/db_schemes/minirag/alembic.ini.example create mode 100644 src/models/db_schemes/minirag/alembic/README create mode 100644 src/models/db_schemes/minirag/alembic/env.py create mode 100644 src/models/db_schemes/minirag/alembic/script.py.mako create mode 100644 src/models/db_schemes/minirag/alembic/versions/fee4cd54bd38_initial_commit.py create mode 100644 src/models/db_schemes/minirag/schemes/__init__.py create mode 100644 src/models/db_schemes/minirag/schemes/asset.py create mode 100644 src/models/db_schemes/minirag/schemes/datachunk.py create mode 100644 src/models/db_schemes/minirag/schemes/minirag_base.py create mode 100644 src/models/db_schemes/minirag/schemes/project.py diff --git a/README.md b/README.md index 7ea5f35d..bd758bb2 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,13 @@ This is an educational project where all of the codes where explained (step by s - Python 3.8 or later +#### Install Dependencies + +```bash +sudo apt update +sudo apt install libpq-dev gcc python3-dev +``` + #### Install Python using MiniConda 1) Download and install MiniConda from [here](https://docs.anaconda.com/free/miniconda/#quick-command-line-install) diff --git a/docker/.env.example b/docker/.env.example index 0cbf7103..092d0b77 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -1,2 +1,3 @@ 
MONGO_INITDB_ROOT_USERNAME= MONGO_INITDB_ROOT_PASSWORD= +POSTGRES_PASSWORD= diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 9905af07..4dd79847 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,22 +1,29 @@ services: mongodb: image: mongo:7-jammy - container_name: mongodb - ports: - "27007:27017" - volumes: - mongodata:/data/db - environment: - MONGO_INITDB_ROOT_USERNAME=${MONGO_INITDB_ROOT_USERNAME} - MONGO_INITDB_ROOT_PASSWORD=${MONGO_INITDB_ROOT_PASSWORD} - networks: - backend - + restart: always + + pgvector: + image: pgvector/pgvector:0.8.0-pg17 + container_name: pgvector + ports: + - "5432:5432" + volumes: + - pgvector_data:/var/lib/postgresql/data + environment: + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} + networks: + - backend restart: always networks: @@ -24,3 +31,4 @@ networks: volumes: mongodata: + pgvector_data: diff --git a/src/.env.example b/src/.env.example index 1d1336ac..3130d813 100644 --- a/src/.env.example +++ b/src/.env.example @@ -1,35 +1,38 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" -OPENAI_API_KEY="" - = -FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - = -MONGODB_URL="mongodb://admin:admin@localhost:27007" -MONGODB_DATABASE="mini-rag" - = -# ========================= LLM Config ========================= -GENERATION_BACKEND = -EMBEDDING_BACKEND = - = -OPENAI_API_KEY="" -OPENAI_API_URL= -COHERE_API_KEY="" - = -GENERATION_MODEL_ID="gpt-3.5-turbo-0125" -EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" -EMBEDDING_MODEL_SIZE=384 - = -INPUT_DAFAULT_MAX_CHARACTERS=1024 -GENERATION_DAFAULT_MAX_TOKENS=200 -GENERATION_DAFAULT_TEMPERATURE=0.1 - = -# ========================= Vector DB Config ========================= -VECTOR_DB_BACKEND = -VECTOR_DB_PATH = -VECTOR_DB_DISTANCE_METHOD = - = -# ========================= Template Configs ========================= -PRIMARY_LANG = "en" -DEFAULT_LANG = "en" +APP_NAME="mini-RAG" +APP_VERSION="0.1" 
+OPENAI_API_KEY="" + +FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + +POSTGRES_USERNAME="postgres" +POSTGRES_PASSWORD="PASSWORD" +POSTGRES_HOST="localhost" +POSTGRES_PORT=5432 +POSTGRES_MAIN_DATABASE="minirag" + +# ========================= LLM Config ========================= +GENERATION_BACKEND = +EMBEDDING_BACKEND = + +OPENAI_API_KEY="" +OPENAI_API_URL= +COHERE_API_KEY="" + +GENERATION_MODEL_ID="gpt-3.5-turbo-0125" +EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" +EMBEDDING_MODEL_SIZE=384 + +INPUT_DAFAULT_MAX_CHARACTERS=1024 +GENERATION_DAFAULT_MAX_TOKENS=200 +GENERATION_DAFAULT_TEMPERATURE=0.1 + +# ========================= Vector DB Config ========================= +VECTOR_DB_BACKEND = +VECTOR_DB_PATH = +VECTOR_DB_DISTANCE_METHOD = + +# ========================= Template Configs ========================= +PRIMARY_LANG = "en" +DEFAULT_LANG = "en" diff --git a/src/helpers/config.py b/src/helpers/config.py index d97bd5c0..cc76b098 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -10,8 +10,11 @@ class Settings(BaseSettings): FILE_MAX_SIZE: int FILE_DEFAULT_CHUNK_SIZE: int - MONGODB_URL: str - MONGODB_DATABASE: str + POSTGRES_USERNAME: str + POSTGRES_PASSWORD: str + POSTGRES_HOST: str + POSTGRES_PORT: int + POSTGRES_MAIN_DATABASE: str GENERATION_BACKEND: str EMBEDDING_BACKEND: str diff --git a/src/main.py b/src/main.py index e1d5ea5f..b711641a 100644 --- a/src/main.py +++ b/src/main.py @@ -1,17 +1,23 @@ from fastapi import FastAPI from routes import base, data, nlp -from motor.motor_asyncio import AsyncIOMotorClient from helpers.config import get_settings from stores.llm.LLMProviderFactory import LLMProviderFactory from stores.vectordb.VectorDBProviderFactory import VectorDBProviderFactory from stores.llm.templates.template_parser import TemplateParser +from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession +from sqlalchemy.orm import sessionmaker app = FastAPI() 
async def startup_span(): settings = get_settings() - app.mongo_conn = AsyncIOMotorClient(settings.MONGODB_URL) - app.db_client = app.mongo_conn[settings.MONGODB_DATABASE] + + postgres_conn = f"postgresql+asyncpg://{settings.POSTGRES_USERNAME}:{settings.POSTGRES_PASSWORD}@{settings.POSTGRES_HOST}:{settings.POSTGRES_PORT}/{settings.POSTGRES_MAIN_DATABASE}" + + app.db_engine = create_async_engine(postgres_conn) + app.db_client = sessionmaker( + app.db_engine, class_=AsyncSession, expire_on_commit=False + ) llm_provider_factory = LLMProviderFactory(settings) vectordb_provider_factory = VectorDBProviderFactory(settings) @@ -38,7 +44,7 @@ async def startup_span(): async def shutdown_span(): - app.mongo_conn.close() + app.db_engine.dispose() app.vectordb_client.disconnect() app.on_event("startup")(startup_span) diff --git a/src/models/db_schemes/minirag/.gitignore b/src/models/db_schemes/minirag/.gitignore new file mode 100644 index 00000000..a2462558 --- /dev/null +++ b/src/models/db_schemes/minirag/.gitignore @@ -0,0 +1 @@ +alembic.ini diff --git a/src/models/db_schemes/minirag/README.md b/src/models/db_schemes/minirag/README.md new file mode 100644 index 00000000..a9437d7f --- /dev/null +++ b/src/models/db_schemes/minirag/README.md @@ -0,0 +1,21 @@ +## Run Alembic Migrations + +### Configuration + +```bash +cp alembic.ini.example alembic.ini +``` + +- Update the `alembic.ini` with your database credentials (`sqlalchemy.url`) + +### (Optional) Create a new migration + +```bash +alembic revision --autogenerate -m "Add ..." 
+``` + +### Upgrade the database + +```bash +alembic upgrade head +``` diff --git a/src/models/db_schemes/minirag/__init__.py b/src/models/db_schemes/minirag/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/models/db_schemes/minirag/alembic.ini.example b/src/models/db_schemes/minirag/alembic.ini.example new file mode 100644 index 00000000..0e50bdb1 --- /dev/null +++ b/src/models/db_schemes/minirag/alembic.ini.example @@ -0,0 +1,117 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +# Use forward slashes (/) also on windows to provide an os agnostic path +script_location = alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. 
When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +# version_path_separator = newline +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = driver://user:pass@localhost/dbname + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. 
See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/src/models/db_schemes/minirag/alembic/README b/src/models/db_schemes/minirag/alembic/README new file mode 100644 index 00000000..98e4f9c4 --- /dev/null +++ b/src/models/db_schemes/minirag/alembic/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/src/models/db_schemes/minirag/alembic/env.py b/src/models/db_schemes/minirag/alembic/env.py new file mode 100644 index 00000000..7b1c3462 --- /dev/null +++ b/src/models/db_schemes/minirag/alembic/env.py @@ -0,0 +1,79 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool +from schemes import SQLAlchemyBase + +from alembic import context + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. 
+if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = SQLAlchemyBase.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. 
+ + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/src/models/db_schemes/minirag/alembic/script.py.mako b/src/models/db_schemes/minirag/alembic/script.py.mako new file mode 100644 index 00000000..fbc4b07d --- /dev/null +++ b/src/models/db_schemes/minirag/alembic/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/src/models/db_schemes/minirag/alembic/versions/fee4cd54bd38_initial_commit.py b/src/models/db_schemes/minirag/alembic/versions/fee4cd54bd38_initial_commit.py new file mode 100644 index 00000000..e72bda01 --- /dev/null +++ b/src/models/db_schemes/minirag/alembic/versions/fee4cd54bd38_initial_commit.py @@ -0,0 +1,76 @@ +"""Initial Commit + +Revision ID: fee4cd54bd38 +Revises: +Create Date: 2024-12-02 11:21:07.921865 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision: str = 'fee4cd54bd38' +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('projects', + sa.Column('project_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('project_uuid', sa.UUID(), nullable=False), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True), + sa.PrimaryKeyConstraint('project_id'), + sa.UniqueConstraint('project_uuid') + ) + op.create_table('assets', + sa.Column('asset_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('asset_uuid', sa.UUID(), nullable=False), + sa.Column('asset_type', sa.String(), nullable=False), + sa.Column('asset_name', sa.String(), nullable=False), + sa.Column('asset_size', sa.Integer(), nullable=False), + sa.Column('asset_config', postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column('asset_project_id', sa.Integer(), nullable=False), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True), + sa.ForeignKeyConstraint(['asset_project_id'], ['projects.project_id'], ), + sa.PrimaryKeyConstraint('asset_id'), + sa.UniqueConstraint('asset_uuid') + ) + op.create_index('ix_asset_project_id', 'assets', ['asset_project_id'], unique=False) + op.create_index('ix_asset_type', 'assets', ['asset_type'], unique=False) + op.create_table('chunks', + sa.Column('chunk_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('chunk_uuid', sa.UUID(), nullable=False), + sa.Column('chunk_text', sa.String(), nullable=False), + sa.Column('chunk_metadata', postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column('chunk_order', sa.Integer(), 
nullable=False), + sa.Column('chunk_project_id', sa.Integer(), nullable=False), + sa.Column('chunk_asset_id', sa.Integer(), nullable=False), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True), + sa.ForeignKeyConstraint(['chunk_asset_id'], ['assets.asset_id'], ), + sa.ForeignKeyConstraint(['chunk_project_id'], ['projects.project_id'], ), + sa.PrimaryKeyConstraint('chunk_id'), + sa.UniqueConstraint('chunk_uuid') + ) + op.create_index('ix_chunk_asset_id', 'chunks', ['chunk_asset_id'], unique=False) + op.create_index('ix_chunk_project_id', 'chunks', ['chunk_project_id'], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index('ix_chunk_project_id', table_name='chunks') + op.drop_index('ix_chunk_asset_id', table_name='chunks') + op.drop_table('chunks') + op.drop_index('ix_asset_type', table_name='assets') + op.drop_index('ix_asset_project_id', table_name='assets') + op.drop_table('assets') + op.drop_table('projects') + # ### end Alembic commands ### diff --git a/src/models/db_schemes/minirag/schemes/__init__.py b/src/models/db_schemes/minirag/schemes/__init__.py new file mode 100644 index 00000000..2a6cc0d0 --- /dev/null +++ b/src/models/db_schemes/minirag/schemes/__init__.py @@ -0,0 +1,4 @@ +from .minirag_base import SQLAlchemyBase +from .asset import Asset +from .project import Project +from .datachunk import DataChunk, RetrievedDocument diff --git a/src/models/db_schemes/minirag/schemes/asset.py b/src/models/db_schemes/minirag/schemes/asset.py new file mode 100644 index 00000000..a95f54f4 --- /dev/null +++ b/src/models/db_schemes/minirag/schemes/asset.py @@ -0,0 +1,31 @@ +from .minirag_base import SQLAlchemyBase +from sqlalchemy import Column, Integer, DateTime, func, String, ForeignKey +from sqlalchemy.dialects.postgresql import UUID, JSONB +from 
sqlalchemy.orm import relationship +from sqlalchemy import Index +import uuid + +class Asset(SQLAlchemyBase): + + __tablename__ = "assets" + + asset_id = Column(Integer, primary_key=True, autoincrement=True) + asset_uuid = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, nullable=False) + + asset_type = Column(String, nullable=False) + asset_name = Column(String, nullable=False) + asset_size = Column(Integer, nullable=False) + asset_config = Column(JSONB, nullable=True) + + asset_project_id = Column(Integer, ForeignKey("projects.project_id"), nullable=False) + + created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) + updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) + + project = relationship("Project", back_populates="assets") + + __table_args__ = ( + Index('ix_asset_project_id', asset_project_id), + Index('ix_asset_type', asset_type), + ) + diff --git a/src/models/db_schemes/minirag/schemes/datachunk.py b/src/models/db_schemes/minirag/schemes/datachunk.py new file mode 100644 index 00000000..536ca1b2 --- /dev/null +++ b/src/models/db_schemes/minirag/schemes/datachunk.py @@ -0,0 +1,36 @@ +from .minirag_base import SQLAlchemyBase +from sqlalchemy import Column, Integer, DateTime, func, String, ForeignKey +from sqlalchemy.dialects.postgresql import UUID, JSONB +from sqlalchemy.orm import relationship +from sqlalchemy import Index +from pydantic import BaseModel +import uuid + +class DataChunk(SQLAlchemyBase): + + __tablename__ = "chunks" + + chunk_id = Column(Integer, primary_key=True, autoincrement=True) + chunk_uuid = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, nullable=False) + + chunk_text = Column(String, nullable=False) + chunk_metadata = Column(JSONB, nullable=True) + chunk_order = Column(Integer, nullable=False) + + chunk_project_id = Column(Integer, ForeignKey("projects.project_id"), nullable=False) + chunk_asset_id = Column(Integer, ForeignKey("assets.asset_id"), 
nullable=False) + + created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) + updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) + + project = relationship("Project", back_populates="chunks") + asset = relationship("Asset", back_populates="chunks") + + __table_args__ = ( + Index('ix_chunk_project_id', chunk_project_id), + Index('ix_chunk_asset_id', chunk_asset_id), + ) + +class RetrievedDocument(BaseModel): + text: str + score: float diff --git a/src/models/db_schemes/minirag/schemes/minirag_base.py b/src/models/db_schemes/minirag/schemes/minirag_base.py new file mode 100644 index 00000000..e40835fe --- /dev/null +++ b/src/models/db_schemes/minirag/schemes/minirag_base.py @@ -0,0 +1,2 @@ +from sqlalchemy.ext.declarative import declarative_base +SQLAlchemyBase = declarative_base() diff --git a/src/models/db_schemes/minirag/schemes/project.py b/src/models/db_schemes/minirag/schemes/project.py new file mode 100644 index 00000000..884899e4 --- /dev/null +++ b/src/models/db_schemes/minirag/schemes/project.py @@ -0,0 +1,14 @@ +from .minirag_base import SQLAlchemyBase +from sqlalchemy import Column, Integer, DateTime, func +from sqlalchemy.dialects.postgresql import UUID +import uuid + +class Project(SQLAlchemyBase): + + __tablename__ = "projects" + + project_id = Column(Integer, primary_key=True, autoincrement=True) + project_uuid = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, nullable=False) + + created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) + updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) diff --git a/src/requirements.txt b/src/requirements.txt index 8ac2317d..7401d1d3 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -11,3 +11,7 @@ pydantic-mongo==2.3.0 openai==1.35.13 cohere==5.5.8 qdrant-client==1.10.1 +SQLAlchemy==2.0.36 +asyncpg==0.30.0 +alembic==1.14.0 +psycopg2==2.9.10 From 
0c9cbb10f7d186956cbff06df9bd8f4cb0b7f2d4 Mon Sep 17 00:00:00 2001 From: bakrianoo Date: Mon, 2 Dec 2024 13:58:16 +0200 Subject: [PATCH 32/65] Update Data Models --- src/controllers/NLPController.py | 2 +- src/controllers/ProjectController.py | 2 +- src/models/AssetModel.py | 58 ++++++-------- src/models/ChunkModel.py | 77 +++++++----------- src/models/ProjectModel.py | 80 +++++++++---------- src/models/db_schemes/__init__.py | 4 +- src/models/db_schemes/asset.py | 37 --------- src/models/db_schemes/data_chunk.py | 30 ------- .../db_schemes/minirag/schemes/asset.py | 1 + .../db_schemes/minirag/schemes/project.py | 4 + src/models/db_schemes/project.py | 30 ------- src/routes/data.py | 20 ++--- src/routes/nlp.py | 10 +-- src/stores/llm/providers/CoHereProvider.py | 2 +- src/stores/llm/providers/OpenAIProvider.py | 2 +- 15 files changed, 113 insertions(+), 246 deletions(-) delete mode 100644 src/models/db_schemes/asset.py delete mode 100644 src/models/db_schemes/data_chunk.py delete mode 100644 src/models/db_schemes/project.py diff --git a/src/controllers/NLPController.py b/src/controllers/NLPController.py index 1760bc2d..0fa23bc2 100644 --- a/src/controllers/NLPController.py +++ b/src/controllers/NLPController.py @@ -108,7 +108,7 @@ def answer_rag_question(self, project: Project, query: str, limit: int = 10): documents_prompts = "\n".join([ self.template_parser.get("rag", "document_prompt", { "doc_num": idx + 1, - "chunk_text": doc.text, + "chunk_text": self.generation_client.process_text(doc.text), }) for idx, doc in enumerate(retrieved_documents) ]) diff --git a/src/controllers/ProjectController.py b/src/controllers/ProjectController.py index 08862cd0..b3105a57 100644 --- a/src/controllers/ProjectController.py +++ b/src/controllers/ProjectController.py @@ -11,7 +11,7 @@ def __init__(self): def get_project_path(self, project_id: str): project_dir = os.path.join( self.files_dir, - project_id + str(project_id) ) if not os.path.exists(project_dir): diff --git 
a/src/models/AssetModel.py b/src/models/AssetModel.py index 2185f321..594e3a96 100644 --- a/src/models/AssetModel.py +++ b/src/models/AssetModel.py @@ -2,61 +2,49 @@ from .db_schemes import Asset from .enums.DataBaseEnum import DataBaseEnum from bson import ObjectId +from sqlalchemy.future import select class AssetModel(BaseDataModel): def __init__(self, db_client: object): super().__init__(db_client=db_client) - self.collection = self.db_client[DataBaseEnum.COLLECTION_ASSET_NAME.value] + self.db_client = db_client @classmethod async def create_instance(cls, db_client: object): instance = cls(db_client) - await instance.init_collection() return instance - async def init_collection(self): - all_collections = await self.db_client.list_collection_names() - if DataBaseEnum.COLLECTION_ASSET_NAME.value not in all_collections: - self.collection = self.db_client[DataBaseEnum.COLLECTION_ASSET_NAME.value] - indexes = Asset.get_indexes() - for index in indexes: - await self.collection.create_index( - index["key"], - name=index["name"], - unique=index["unique"] - ) - async def create_asset(self, asset: Asset): - result = await self.collection.insert_one(asset.dict(by_alias=True, exclude_unset=True)) - asset.id = result.inserted_id - + async with self.db_client() as session: + async with session.begin(): + session.add(asset) + await session.commit() + await session.refresh(asset) return asset async def get_all_project_assets(self, asset_project_id: str, asset_type: str): - records = await self.collection.find({ - "asset_project_id": ObjectId(asset_project_id) if isinstance(asset_project_id, str) else asset_project_id, - "asset_type": asset_type, - }).to_list(length=None) - - return [ - Asset(**record) - for record in records - ] + async with self.db_client() as session: + stmt = select(Asset).where( + Asset.asset_project_id == asset_project_id, + Asset.asset_type == asset_type + ) + result = await session.execute(stmt) + records = result.scalars().all() + return records async 
def get_asset_record(self, asset_project_id: str, asset_name: str): - record = await self.collection.find_one({ - "asset_project_id": ObjectId(asset_project_id) if isinstance(asset_project_id, str) else asset_project_id, - "asset_name": asset_name, - }) - - if record: - return Asset(**record) - - return None + async with self.db_client() as session: + stmt = select(Asset).where( + Asset.asset_project_id == asset_project_id, + Asset.asset_name == asset_name + ) + result = await session.execute(stmt) + record = result.scalar_one_or_none() + return record diff --git a/src/models/ChunkModel.py b/src/models/ChunkModel.py index 37cbb23a..60ca1d82 100644 --- a/src/models/ChunkModel.py +++ b/src/models/ChunkModel.py @@ -3,76 +3,57 @@ from .enums.DataBaseEnum import DataBaseEnum from bson.objectid import ObjectId from pymongo import InsertOne +from sqlalchemy.future import select +from sqlalchemy import func, delete class ChunkModel(BaseDataModel): def __init__(self, db_client: object): super().__init__(db_client=db_client) - self.collection = self.db_client[DataBaseEnum.COLLECTION_CHUNK_NAME.value] + self.db_client = db_client @classmethod async def create_instance(cls, db_client: object): instance = cls(db_client) - await instance.init_collection() return instance - async def init_collection(self): - all_collections = await self.db_client.list_collection_names() - if DataBaseEnum.COLLECTION_CHUNK_NAME.value not in all_collections: - self.collection = self.db_client[DataBaseEnum.COLLECTION_CHUNK_NAME.value] - indexes = DataChunk.get_indexes() - for index in indexes: - await self.collection.create_index( - index["key"], - name=index["name"], - unique=index["unique"] - ) - async def create_chunk(self, chunk: DataChunk): - result = await self.collection.insert_one(chunk.dict(by_alias=True, exclude_unset=True)) - chunk._id = result.inserted_id + + async with self.db_client() as session: + async with session.begin(): + session.add(chunk) + await session.commit() + await 
session.refresh(chunk) return chunk async def get_chunk(self, chunk_id: str): - result = await self.collection.find_one({ - "_id": ObjectId(chunk_id) - }) - if result is None: - return None - - return DataChunk(**result) + async with self.db_client() as session: + result = await session.execute(select(DataChunk).where(DataChunk.chunk_id == chunk_id)) + chunk = result.scalar_one_or_none() + return chunk async def insert_many_chunks(self, chunks: list, batch_size: int=100): - for i in range(0, len(chunks), batch_size): - batch = chunks[i:i+batch_size] - - operations = [ - InsertOne(chunk.dict(by_alias=True, exclude_unset=True)) - for chunk in batch - ] - - await self.collection.bulk_write(operations) - + async with self.db_client() as session: + async with session.begin(): + for i in range(0, len(chunks), batch_size): + batch = chunks[i:i+batch_size] + session.add_all(batch) + await session.commit() return len(chunks) async def delete_chunks_by_project_id(self, project_id: ObjectId): - result = await self.collection.delete_many({ - "chunk_project_id": project_id - }) - - return result.deleted_count + async with self.db_client() as session: + stmt = delete(DataChunk).where(DataChunk.chunk_project_id == project_id) + result = await session.execute(stmt) + await session.commit() + return result.rowcount async def get_poject_chunks(self, project_id: ObjectId, page_no: int=1, page_size: int=50): - records = await self.collection.find({ - "chunk_project_id": project_id - }).skip( - (page_no-1) * page_size - ).limit(page_size).to_list(length=None) - - return [ - DataChunk(**record) - for record in records - ] + async with self.db_client() as session: + stmt = select(DataChunk).where(DataChunk.chunk_project_id == project_id).offset((page_no - 1) * page_size).limit(page_size) + result = await session.execute(stmt) + records = result.scalars().all() + return records diff --git a/src/models/ProjectModel.py b/src/models/ProjectModel.py index 94905d95..c3342af8 100644 --- 
a/src/models/ProjectModel.py +++ b/src/models/ProjectModel.py @@ -1,69 +1,61 @@ from .BaseDataModel import BaseDataModel from .db_schemes import Project from .enums.DataBaseEnum import DataBaseEnum +from sqlalchemy.future import select +from sqlalchemy import func class ProjectModel(BaseDataModel): def __init__(self, db_client: object): super().__init__(db_client=db_client) - self.collection = self.db_client[DataBaseEnum.COLLECTION_PROJECT_NAME.value] + self.db_client = db_client @classmethod async def create_instance(cls, db_client: object): instance = cls(db_client) - await instance.init_collection() return instance - async def init_collection(self): - all_collections = await self.db_client.list_collection_names() - if DataBaseEnum.COLLECTION_PROJECT_NAME.value not in all_collections: - self.collection = self.db_client[DataBaseEnum.COLLECTION_PROJECT_NAME.value] - indexes = Project.get_indexes() - for index in indexes: - await self.collection.create_index( - index["key"], - name=index["name"], - unique=index["unique"] - ) - - async def create_project(self, project: Project): - - result = await self.collection.insert_one(project.dict(by_alias=True, exclude_unset=True)) - project.id = result.inserted_id - + async with self.db_client() as session: + async with session.begin(): + session.add(project) + await session.commit() + await session.refresh(project) + return project async def get_project_or_create_one(self, project_id: str): + async with self.db_client() as session: + async with session.begin(): + query = select(Project).where(Project.project_id == project_id) + result = await session.execute(query) + project = result.scalar_one_or_none() + if project is None: + project_rec = Project( + project_id = project_id + ) + + project = await self.create_project(project=project_rec) + return project + else: + return project - record = await self.collection.find_one({ - "project_id": project_id - }) - - if record is None: - # create new project - project = 
Project(project_id=project_id) - project = await self.create_project(project=project) + async def get_all_projects(self, page: int=1, page_size: int=10): - return project - - return Project(**record) + async with self.db_client() as session: + async with session.begin(): - async def get_all_projects(self, page: int=1, page_size: int=10): + total_documents = await session.execute(select( + func.count( Project.project_id ) + )) - # count total number of documents - total_documents = await self.collection.count_documents({}) + total_documents = total_documents.scalar_one() - # calculate total number of pages - total_pages = total_documents // page_size - if total_documents % page_size > 0: - total_pages += 1 + total_pages = total_documents // page_size + if total_documents % page_size > 0: + total_pages += 1 - cursor = self.collection.find().skip( (page-1) * page_size ).limit(page_size) - projects = [] - async for document in cursor: - projects.append( - Project(**document) - ) + query = select(Project).offset((page - 1) * page_size ).limit(page_size) + projects = await session.execute(query).scalars().all() - return projects, total_pages + return projects, total_pages diff --git a/src/models/db_schemes/__init__.py b/src/models/db_schemes/__init__.py index 1d1cbe88..e5301c78 100644 --- a/src/models/db_schemes/__init__.py +++ b/src/models/db_schemes/__init__.py @@ -1,3 +1 @@ -from .project import Project -from .data_chunk import DataChunk, RetrievedDocument -from .asset import Asset +from models.db_schemes.minirag.schemes import Project, DataChunk, Asset, RetrievedDocument diff --git a/src/models/db_schemes/asset.py b/src/models/db_schemes/asset.py deleted file mode 100644 index cc3ca3d4..00000000 --- a/src/models/db_schemes/asset.py +++ /dev/null @@ -1,37 +0,0 @@ -from pydantic import BaseModel, Field, validator -from typing import Optional -from bson.objectid import ObjectId -from datetime import datetime - -class Asset(BaseModel): - id: Optional[ObjectId] = 
Field(None, alias="_id") - asset_project_id: ObjectId - asset_type: str = Field(..., min_length=1) - asset_name: str = Field(..., min_length=1) - asset_size: int = Field(ge=0, default=None) - asset_config: dict = Field(default=None) - asset_pushed_at: datetime = Field(default=datetime.utcnow) - - class Config: - arbitrary_types_allowed = True - - @classmethod - def get_indexes(cls): - - return [ - { - "key": [ - ("asset_project_id", 1) - ], - "name": "asset_project_id_index_1", - "unique": False - }, - { - "key": [ - ("asset_project_id", 1), - ("asset_name", 1) - ], - "name": "asset_project_id_name_index_1", - "unique": True - }, - ] \ No newline at end of file diff --git a/src/models/db_schemes/data_chunk.py b/src/models/db_schemes/data_chunk.py deleted file mode 100644 index 853c7fa5..00000000 --- a/src/models/db_schemes/data_chunk.py +++ /dev/null @@ -1,30 +0,0 @@ -from pydantic import BaseModel, Field, validator -from typing import Optional -from bson.objectid import ObjectId - -class DataChunk(BaseModel): - id: Optional[ObjectId] = Field(None, alias="_id") - chunk_text: str = Field(..., min_length=1) - chunk_metadata: dict - chunk_order: int = Field(..., gt=0) - chunk_project_id: ObjectId - chunk_asset_id: ObjectId - - class Config: - arbitrary_types_allowed = True - - @classmethod - def get_indexes(cls): - return [ - { - "key": [ - ("chunk_project_id", 1) - ], - "name": "chunk_project_id_index_1", - "unique": False - } - ] - -class RetrievedDocument(BaseModel): - text: str - score: float diff --git a/src/models/db_schemes/minirag/schemes/asset.py b/src/models/db_schemes/minirag/schemes/asset.py index a95f54f4..bce853bd 100644 --- a/src/models/db_schemes/minirag/schemes/asset.py +++ b/src/models/db_schemes/minirag/schemes/asset.py @@ -23,6 +23,7 @@ class Asset(SQLAlchemyBase): updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) project = relationship("Project", back_populates="assets") + chunks = relationship("DataChunk", 
back_populates="asset") __table_args__ = ( Index('ix_asset_project_id', asset_project_id), diff --git a/src/models/db_schemes/minirag/schemes/project.py b/src/models/db_schemes/minirag/schemes/project.py index 884899e4..c41dcf00 100644 --- a/src/models/db_schemes/minirag/schemes/project.py +++ b/src/models/db_schemes/minirag/schemes/project.py @@ -2,6 +2,7 @@ from sqlalchemy import Column, Integer, DateTime, func from sqlalchemy.dialects.postgresql import UUID import uuid +from sqlalchemy.orm import relationship class Project(SQLAlchemyBase): @@ -12,3 +13,6 @@ class Project(SQLAlchemyBase): created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) + + chunks = relationship("DataChunk", back_populates="project") + assets = relationship("Asset", back_populates="project") diff --git a/src/models/db_schemes/project.py b/src/models/db_schemes/project.py deleted file mode 100644 index 3621abec..00000000 --- a/src/models/db_schemes/project.py +++ /dev/null @@ -1,30 +0,0 @@ -from pydantic import BaseModel, Field, validator -from typing import Optional -from bson.objectid import ObjectId - -class Project(BaseModel): - id: Optional[ObjectId] = Field(None, alias="_id") - project_id: str = Field(..., min_length=1) - - @validator('project_id') - def validate_project_id(cls, value): - if not value.isalnum(): - raise ValueError('project_id must be alphanumeric') - - return value - - class Config: - arbitrary_types_allowed = True - - @classmethod - def get_indexes(cls): - - return [ - { - "key": [ - ("project_id", 1) - ], - "name": "project_id_index_1", - "unique": True - } - ] \ No newline at end of file diff --git a/src/routes/data.py b/src/routes/data.py index ea49a178..56713af7 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -21,7 +21,7 @@ ) @data_router.post("/upload/{project_id}") -async def upload_data(request: Request, project_id: str, file: 
UploadFile, +async def upload_data(request: Request, project_id: int, file: UploadFile, app_settings: Settings = Depends(get_settings)): @@ -73,7 +73,7 @@ async def upload_data(request: Request, project_id: str, file: UploadFile, ) asset_resource = Asset( - asset_project_id=project.id, + asset_project_id=project.project_id, asset_type=AssetTypeEnum.FILE.value, asset_name=file_id, asset_size=os.path.getsize(file_path) @@ -84,12 +84,12 @@ async def upload_data(request: Request, project_id: str, file: UploadFile, return JSONResponse( content={ "signal": ResponseSignal.FILE_UPLOAD_SUCCESS.value, - "file_id": str(asset_record.id), + "file_id": str(asset_record.asset_id), } ) @data_router.post("/process/{project_id}") -async def process_endpoint(request: Request, project_id: str, process_request: ProcessRequest): +async def process_endpoint(request: Request, project_id: int, process_request: ProcessRequest): chunk_size = process_request.chunk_size overlap_size = process_request.overlap_size @@ -110,7 +110,7 @@ async def process_endpoint(request: Request, project_id: str, process_request: P project_files_ids = {} if process_request.file_id: asset_record = await asset_model.get_asset_record( - asset_project_id=project.id, + asset_project_id=project.project_id, asset_name=process_request.file_id ) @@ -123,19 +123,19 @@ async def process_endpoint(request: Request, project_id: str, process_request: P ) project_files_ids = { - asset_record.id: asset_record.asset_name + asset_record.asset_id: asset_record.asset_name } else: project_files = await asset_model.get_all_project_assets( - asset_project_id=project.id, + asset_project_id=project.project_id, asset_type=AssetTypeEnum.FILE.value, ) project_files_ids = { - record.id: record.asset_name + record.asset_id: record.asset_name for record in project_files } @@ -158,7 +158,7 @@ async def process_endpoint(request: Request, project_id: str, process_request: P if do_reset == 1: _ = await chunk_model.delete_chunks_by_project_id( - 
project_id=project.id + project_id=project.project_id ) for asset_id, file_id in project_files_ids.items(): @@ -189,7 +189,7 @@ async def process_endpoint(request: Request, project_id: str, process_request: P chunk_text=chunk.page_content, chunk_metadata=chunk.metadata, chunk_order=i+1, - chunk_project_id=project.id, + chunk_project_id=project.project_id, chunk_asset_id=asset_id ) for i, chunk in enumerate(file_chunks) diff --git a/src/routes/nlp.py b/src/routes/nlp.py index 9d5e9a8e..c3c7e36d 100644 --- a/src/routes/nlp.py +++ b/src/routes/nlp.py @@ -16,7 +16,7 @@ ) @nlp_router.post("/index/push/{project_id}") -async def index_project(request: Request, project_id: str, push_request: PushRequest): +async def index_project(request: Request, project_id: int, push_request: PushRequest): project_model = await ProjectModel.create_instance( db_client=request.app.db_client @@ -51,7 +51,7 @@ async def index_project(request: Request, project_id: str, push_request: PushReq idx = 0 while has_records: - page_chunks = await chunk_model.get_poject_chunks(project_id=project.id, page_no=page_no) + page_chunks = await chunk_model.get_poject_chunks(project_id=project.project_id, page_no=page_no) if len(page_chunks): page_no += 1 @@ -87,7 +87,7 @@ async def index_project(request: Request, project_id: str, push_request: PushReq ) @nlp_router.get("/index/info/{project_id}") -async def get_project_index_info(request: Request, project_id: str): +async def get_project_index_info(request: Request, project_id: int): project_model = await ProjectModel.create_instance( db_client=request.app.db_client @@ -114,7 +114,7 @@ async def get_project_index_info(request: Request, project_id: str): ) @nlp_router.post("/index/search/{project_id}") -async def search_index(request: Request, project_id: str, search_request: SearchRequest): +async def search_index(request: Request, project_id: int, search_request: SearchRequest): project_model = await ProjectModel.create_instance( 
db_client=request.app.db_client @@ -151,7 +151,7 @@ async def search_index(request: Request, project_id: str, search_request: Search ) @nlp_router.post("/index/answer/{project_id}") -async def answer_rag(request: Request, project_id: str, search_request: SearchRequest): +async def answer_rag(request: Request, project_id: int, search_request: SearchRequest): project_model = await ProjectModel.create_instance( db_client=request.app.db_client diff --git a/src/stores/llm/providers/CoHereProvider.py b/src/stores/llm/providers/CoHereProvider.py index 80408dda..91623303 100644 --- a/src/stores/llm/providers/CoHereProvider.py +++ b/src/stores/llm/providers/CoHereProvider.py @@ -93,5 +93,5 @@ def embed_text(self, text: str, document_type: str = None): def construct_prompt(self, prompt: str, role: str): return { "role": role, - "text": self.process_text(prompt) + "text": prompt, } \ No newline at end of file diff --git a/src/stores/llm/providers/OpenAIProvider.py b/src/stores/llm/providers/OpenAIProvider.py index a9633040..89fae5e5 100644 --- a/src/stores/llm/providers/OpenAIProvider.py +++ b/src/stores/llm/providers/OpenAIProvider.py @@ -96,7 +96,7 @@ def embed_text(self, text: str, document_type: str = None): def construct_prompt(self, prompt: str, role: str): return { "role": role, - "content": self.process_text(prompt) + "content": prompt, } From ecefef361130e9773c12b931f233c67d1bf2e554 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Fri, 3 Jan 2025 20:13:29 +0200 Subject: [PATCH 33/65] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bd758bb2..1e2e544c 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ This is an educational project where all of the codes where explained (step by s | 17 | Augmented Answers | [Video](https://www.youtube.com/watch?v=1Wx8BoM5pLU) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-011) | | 18 | Checkpoint-1 + Fix Issues | [Video](https://youtu.be/6zG4Idxldvg) | 
[branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | | 19 | Ollama Local LLM Server | [Video](https://youtu.be/-epZ1hAAtrs) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | +| 20 | From Mongo to Postgres + SQLAlchemy & Alembic | [Video](https://www.youtube.com/watch?v=BVOq7Ek2Up0) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-013) | ## Requirements From 70c57af8fa88a99a34145c3bdc174e61264f27bd Mon Sep 17 00:00:00 2001 From: bakrianoo Date: Mon, 17 Mar 2025 10:20:27 +0200 Subject: [PATCH 34/65] update requirements + .env --- README.md | 6 ++++++ src/.env.example | 22 ++++++++++++---------- src/helpers/config.py | 3 +++ src/requirements.txt | 4 +++- src/stores/vectordb/VectorDBEnums.py | 17 +++++++++++++++++ 5 files changed, 41 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 1e2e544c..c956f9d5 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,12 @@ $ pip install -r requirements.txt $ cp .env.example .env ``` +### Run Alembic Migration + +```bash +$ alembic upgrade head +``` + Set your environment variables in the `.env` file. Like `OPENAI_API_KEY` value. 
## Run Docker Compose Services diff --git a/src/.env.example b/src/.env.example index 3130d813..a60f99d4 100644 --- a/src/.env.example +++ b/src/.env.example @@ -7,20 +7,21 @@ FILE_MAX_SIZE=10 FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB POSTGRES_USERNAME="postgres" -POSTGRES_PASSWORD="PASSWORD" +POSTGRES_PASSWORD="minirag2222" POSTGRES_HOST="localhost" POSTGRES_PORT=5432 POSTGRES_MAIN_DATABASE="minirag" # ========================= LLM Config ========================= -GENERATION_BACKEND = -EMBEDDING_BACKEND = +GENERATION_BACKEND = "OPENAI" +EMBEDDING_BACKEND = "COHERE" -OPENAI_API_KEY="" +OPENAI_API_KEY="sk-" OPENAI_API_URL= -COHERE_API_KEY="" +COHERE_API_KEY="m8-" -GENERATION_MODEL_ID="gpt-3.5-turbo-0125" +GENERATION_MODEL_ID_LITERAL = ["gpt-4o-mini", "gpt-4o"] +GENERATION_MODEL_ID="gpt-4o-mini" EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" EMBEDDING_MODEL_SIZE=384 @@ -29,10 +30,11 @@ GENERATION_DAFAULT_MAX_TOKENS=200 GENERATION_DAFAULT_TEMPERATURE=0.1 # ========================= Vector DB Config ========================= -VECTOR_DB_BACKEND = -VECTOR_DB_PATH = -VECTOR_DB_DISTANCE_METHOD = +VECTOR_DB_BACKEND_LITERAL = ["QDRANT", "PGVECTOR"] +VECTOR_DB_BACKEND = "PGVECTOR" +VECTOR_DB_PATH = "qdrant_db" +VECTOR_DB_DISTANCE_METHOD = "cosine" # ========================= Template Configs ========================= -PRIMARY_LANG = "en" +PRIMARY_LANG = "ar" DEFAULT_LANG = "en" diff --git a/src/helpers/config.py b/src/helpers/config.py index cc76b098..cf58ab7d 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -1,4 +1,5 @@ from pydantic_settings import BaseSettings, SettingsConfigDict +from typing import List class Settings(BaseSettings): @@ -23,6 +24,7 @@ class Settings(BaseSettings): OPENAI_API_URL: str = None COHERE_API_KEY: str = None + GENERATION_MODEL_ID_LITERAL: List[str] = None GENERATION_MODEL_ID: str = None EMBEDDING_MODEL_ID: str = None EMBEDDING_MODEL_SIZE: int = None @@ -30,6 +32,7 @@ class Settings(BaseSettings): GENERATION_DAFAULT_MAX_TOKENS: 
int = None GENERATION_DAFAULT_TEMPERATURE: float = None + VECTOR_DB_BACKEND_LITERAL: List[str] = None VECTOR_DB_BACKEND : str VECTOR_DB_PATH : str VECTOR_DB_DISTANCE_METHOD: str = None diff --git a/src/requirements.txt b/src/requirements.txt index 7401d1d3..19708670 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -8,10 +8,12 @@ langchain==0.1.20 PyMuPDF==1.24.3 motor==3.4.0 pydantic-mongo==2.3.0 -openai==1.35.13 +openai==1.66.3 cohere==5.5.8 qdrant-client==1.10.1 SQLAlchemy==2.0.36 asyncpg==0.30.0 alembic==1.14.0 psycopg2==2.9.10 +pgvector==0.4.0 +nltk==3.9.1 diff --git a/src/stores/vectordb/VectorDBEnums.py b/src/stores/vectordb/VectorDBEnums.py index 808a73ff..b5b5eae5 100644 --- a/src/stores/vectordb/VectorDBEnums.py +++ b/src/stores/vectordb/VectorDBEnums.py @@ -2,7 +2,24 @@ class VectorDBEnums(Enum): QDRANT = "QDRANT" + PGVECTOR = "PGVECTOR" class DistanceMethodEnums(Enum): COSINE = "cosine" DOT = "dot" + +class PgVectorTableSchemeEnums(Enum): + ID = 'id' + TEXT = 'text' + VECTOR = 'vector' + CHUNK_ID = 'chunk_id' + METADATA = 'metadata' + _PREFIX = 'pgvector' + +class PgVectorDistanceMethodEnums(Enum): + COSINE = "vector_cosine_ops " + DOT = "vector_l2_ops" + +class PgVectorIndexTypeEnums(Enum): + HNSW = "hnsw" + IVFFLAT = "ivfflat" From c42bca6607b4adbdebfaa9c4e36fc70a15fe33f2 Mon Sep 17 00:00:00 2001 From: bakrianoo Date: Thu, 20 Mar 2025 23:39:01 +0200 Subject: [PATCH 35/65] support pgvector + enhance chunking + fix SQL errors --- src/.env.example | 81 ++--- src/controllers/NLPController.py | 44 +-- src/controllers/ProcessController.py | 54 +++- src/helpers/config.py | 1 + src/main.py | 6 +- src/models/ChunkModel.py | 10 + src/routes/data.py | 13 + src/routes/nlp.py | 28 +- src/stores/llm/providers/CoHereProvider.py | 10 +- src/stores/llm/providers/OpenAIProvider.py | 8 +- src/stores/vectordb/VectorDBEnums.py | 2 +- .../vectordb/VectorDBProviderFactory.py | 20 +- .../vectordb/providers/PGVectorProvider.py | 296 ++++++++++++++++++ 
.../vectordb/providers/QdrantDBProvider.py | 31 +- src/stores/vectordb/providers/__init__.py | 1 + 15 files changed, 502 insertions(+), 103 deletions(-) create mode 100644 src/stores/vectordb/providers/PGVectorProvider.py diff --git a/src/.env.example b/src/.env.example index a60f99d4..9516fae2 100644 --- a/src/.env.example +++ b/src/.env.example @@ -1,40 +1,41 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" -OPENAI_API_KEY="" - -FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - -POSTGRES_USERNAME="postgres" -POSTGRES_PASSWORD="minirag2222" -POSTGRES_HOST="localhost" -POSTGRES_PORT=5432 -POSTGRES_MAIN_DATABASE="minirag" - -# ========================= LLM Config ========================= -GENERATION_BACKEND = "OPENAI" -EMBEDDING_BACKEND = "COHERE" - -OPENAI_API_KEY="sk-" -OPENAI_API_URL= -COHERE_API_KEY="m8-" - -GENERATION_MODEL_ID_LITERAL = ["gpt-4o-mini", "gpt-4o"] -GENERATION_MODEL_ID="gpt-4o-mini" -EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" -EMBEDDING_MODEL_SIZE=384 - -INPUT_DAFAULT_MAX_CHARACTERS=1024 -GENERATION_DAFAULT_MAX_TOKENS=200 -GENERATION_DAFAULT_TEMPERATURE=0.1 - -# ========================= Vector DB Config ========================= -VECTOR_DB_BACKEND_LITERAL = ["QDRANT", "PGVECTOR"] -VECTOR_DB_BACKEND = "PGVECTOR" -VECTOR_DB_PATH = "qdrant_db" -VECTOR_DB_DISTANCE_METHOD = "cosine" - -# ========================= Template Configs ========================= -PRIMARY_LANG = "ar" -DEFAULT_LANG = "en" +APP_NAME="mini-RAG" +APP_VERSION="0.1" +OPENAI_API_KEY="sk-" + = +FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + = +POSTGRES_USERNAME="postgres" +POSTGRES_PASSWORD="minirag2222" +POSTGRES_HOST="localhost" +POSTGRES_PORT=5432 +POSTGRES_MAIN_DATABASE="minirag" + = +# ========================= LLM Config ========================= +GENERATION_BACKEND = "OPENAI" +EMBEDDING_BACKEND = "COHERE" + = +OPENAI_API_KEY="sk-" +OPENAI_API_URL= 
+COHERE_API_KEY="m8-" + = +GENERATION_MODEL_ID_LITERAL = ["gpt-4o-mini", "gpt-4o"] +GENERATION_MODEL_ID="gpt-4o-mini" +EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" +EMBEDDING_MODEL_SIZE=384 + = +INPUT_DAFAULT_MAX_CHARACTERS=1024 +GENERATION_DAFAULT_MAX_TOKENS=200 +GENERATION_DAFAULT_TEMPERATURE=0.1 + = +# ========================= Vector DB Config ========================= +VECTOR_DB_BACKEND_LITERAL = ["QDRANT", "PGVECTOR"] +VECTOR_DB_BACKEND = "PGVECTOR" +VECTOR_DB_PATH = "qdrant_db" +VECTOR_DB_DISTANCE_METHOD = "cosine" +VECTOR_DB_PGVEC_INDEX_THRESHOLD = + = +# ========================= Template Configs ========================= +PRIMARY_LANG = "ar" +DEFAULT_LANG = "en" diff --git a/src/controllers/NLPController.py b/src/controllers/NLPController.py index 0fa23bc2..c2e49095 100644 --- a/src/controllers/NLPController.py +++ b/src/controllers/NLPController.py @@ -16,21 +16,21 @@ def __init__(self, vectordb_client, generation_client, self.template_parser = template_parser def create_collection_name(self, project_id: str): - return f"collection_{project_id}".strip() + return f"collection_{self.vectordb_client.default_vector_size}_{project_id}".strip() - def reset_vector_db_collection(self, project: Project): + async def reset_vector_db_collection(self, project: Project): collection_name = self.create_collection_name(project_id=project.project_id) - return self.vectordb_client.delete_collection(collection_name=collection_name) + return await self.vectordb_client.delete_collection(collection_name=collection_name) - def get_vector_db_collection_info(self, project: Project): + async def get_vector_db_collection_info(self, project: Project): collection_name = self.create_collection_name(project_id=project.project_id) - collection_info = self.vectordb_client.get_collection_info(collection_name=collection_name) + collection_info = await self.vectordb_client.get_collection_info(collection_name=collection_name) return json.loads( json.dumps(collection_info, 
default=lambda x: x.__dict__) ) - def index_into_vector_db(self, project: Project, chunks: List[DataChunk], + async def index_into_vector_db(self, project: Project, chunks: List[DataChunk], chunks_ids: List[int], do_reset: bool = False): @@ -40,21 +40,18 @@ def index_into_vector_db(self, project: Project, chunks: List[DataChunk], # step2: manage items texts = [ c.chunk_text for c in chunks ] metadata = [ c.chunk_metadata for c in chunks] - vectors = [ - self.embedding_client.embed_text(text=text, - document_type=DocumentTypeEnum.DOCUMENT.value) - for text in texts - ] + vectors = self.embedding_client.embed_text(text=texts, + document_type=DocumentTypeEnum.DOCUMENT.value) # step3: create collection if not exists - _ = self.vectordb_client.create_collection( + _ = await self.vectordb_client.create_collection( collection_name=collection_name, embedding_size=self.embedding_client.embedding_size, do_reset=do_reset, ) # step4: insert into vector db - _ = self.vectordb_client.insert_many( + _ = await self.vectordb_client.insert_many( collection_name=collection_name, texts=texts, metadata=metadata, @@ -64,22 +61,29 @@ def index_into_vector_db(self, project: Project, chunks: List[DataChunk], return True - def search_vector_db_collection(self, project: Project, text: str, limit: int = 10): + async def search_vector_db_collection(self, project: Project, text: str, limit: int = 10): # step1: get collection name + query_vector = None collection_name = self.create_collection_name(project_id=project.project_id) # step2: get text embedding vector - vector = self.embedding_client.embed_text(text=text, + vectors = self.embedding_client.embed_text(text=text, document_type=DocumentTypeEnum.QUERY.value) - if not vector or len(vector) == 0: + if not vectors or len(vectors) == 0: return False + + if isinstance(vectors, list) and len(vectors) > 0: + query_vector = vectors[0] + + if not query_vector: + return False # step3: do semantic search - results = 
self.vectordb_client.search_by_vector( + results = await self.vectordb_client.search_by_vector( collection_name=collection_name, - vector=vector, + vector=query_vector, limit=limit ) @@ -88,12 +92,12 @@ def search_vector_db_collection(self, project: Project, text: str, limit: int = return results - def answer_rag_question(self, project: Project, query: str, limit: int = 10): + async def answer_rag_question(self, project: Project, query: str, limit: int = 10): answer, full_prompt, chat_history = None, None, None # step1: retrieve related documents - retrieved_documents = self.search_vector_db_collection( + retrieved_documents = await self.search_vector_db_collection( project=project, text=query, limit=limit, diff --git a/src/controllers/ProcessController.py b/src/controllers/ProcessController.py index 7c2eef41..ecaa0877 100644 --- a/src/controllers/ProcessController.py +++ b/src/controllers/ProcessController.py @@ -3,8 +3,14 @@ import os from langchain_community.document_loaders import TextLoader from langchain_community.document_loaders import PyMuPDFLoader -from langchain_text_splitters import RecursiveCharacterTextSplitter from models import ProcessingEnum +from typing import List +from dataclasses import dataclass + +@dataclass +class Document: + page_content: str + metadata: dict class ProcessController(BaseController): @@ -47,12 +53,6 @@ def get_file_content(self, file_id: str): def process_file_content(self, file_content: list, file_id: str, chunk_size: int=100, overlap_size: int=20): - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=chunk_size, - chunk_overlap=overlap_size, - length_function=len, - ) - file_content_texts = [ rec.page_content for rec in file_content @@ -63,13 +63,47 @@ def process_file_content(self, file_content: list, file_id: str, for rec in file_content ] - chunks = text_splitter.create_documents( - file_content_texts, - metadatas=file_content_metadata + # chunks = text_splitter.create_documents( + # file_content_texts, + # 
metadatas=file_content_metadata + # ) + + chunks = self.process_simpler_splitter( + texts=file_content_texts, + metadatas=file_content_metadata, + chunk_size=chunk_size, ) return chunks + def process_simpler_splitter(self, texts: List[str], metadatas: List[dict], chunk_size: int, splitter_tag: str="\n"): + + full_text = " ".join(texts) + + # split by splitter_tag + lines = [ doc.strip() for doc in full_text.split(splitter_tag) if len(doc.strip()) > 1 ] + + chunks = [] + current_chunk = "" + + for line in lines: + current_chunk += line + splitter_tag + if len(current_chunk) >= chunk_size: + chunks.append(Document( + page_content=current_chunk.strip(), + metadata={} + )) + + current_chunk = "" + + if len(current_chunk) >= 0: + chunks.append(Document( + page_content=current_chunk.strip(), + metadata={} + )) + + return chunks + diff --git a/src/helpers/config.py b/src/helpers/config.py index cf58ab7d..5870fe9e 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -36,6 +36,7 @@ class Settings(BaseSettings): VECTOR_DB_BACKEND : str VECTOR_DB_PATH : str VECTOR_DB_DISTANCE_METHOD: str = None + VECTOR_DB_PGVEC_INDEX_THRESHOLD: int = 100 PRIMARY_LANG: str = "en" DEFAULT_LANG: str = "en" diff --git a/src/main.py b/src/main.py index b711641a..36ec33fb 100644 --- a/src/main.py +++ b/src/main.py @@ -20,7 +20,7 @@ async def startup_span(): ) llm_provider_factory = LLMProviderFactory(settings) - vectordb_provider_factory = VectorDBProviderFactory(settings) + vectordb_provider_factory = VectorDBProviderFactory(config=settings, db_client=app.db_client) # generation client app.generation_client = llm_provider_factory.create(provider=settings.GENERATION_BACKEND) @@ -35,7 +35,7 @@ async def startup_span(): app.vectordb_client = vectordb_provider_factory.create( provider=settings.VECTOR_DB_BACKEND ) - app.vectordb_client.connect() + await app.vectordb_client.connect() app.template_parser = TemplateParser( language=settings.PRIMARY_LANG, @@ -45,7 +45,7 @@ async def 
startup_span(): async def shutdown_span(): app.db_engine.dispose() - app.vectordb_client.disconnect() + await app.vectordb_client.disconnect() app.on_event("startup")(startup_span) app.on_event("shutdown")(shutdown_span) diff --git a/src/models/ChunkModel.py b/src/models/ChunkModel.py index 60ca1d82..8768d5fc 100644 --- a/src/models/ChunkModel.py +++ b/src/models/ChunkModel.py @@ -56,4 +56,14 @@ async def get_poject_chunks(self, project_id: ObjectId, page_no: int=1, page_siz result = await session.execute(stmt) records = result.scalars().all() return records + + async def get_total_chunks_count(self, project_id: ObjectId): + total_count = 0 + async with self.db_client() as session: + count_sql = select(func.count(DataChunk.chunk_id)).where(DataChunk.chunk_project_id == project_id) + records_count = await session.execute(count_sql) + total_count = records_count.scalar() + + return total_count + diff --git a/src/routes/data.py b/src/routes/data.py index 56713af7..b9f5de7d 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -12,6 +12,7 @@ from models.AssetModel import AssetModel from models.db_schemes import DataChunk, Asset from models.enums.AssetTypeEnum import AssetTypeEnum +from controllers import NLPController logger = logging.getLogger('uvicorn.error') @@ -103,6 +104,13 @@ async def process_endpoint(request: Request, project_id: int, process_request: P project_id=project_id ) + nlp_controller = NLPController( + vectordb_client=request.app.vectordb_client, + generation_client=request.app.generation_client, + embedding_client=request.app.embedding_client, + template_parser=request.app.template_parser, + ) + asset_model = await AssetModel.create_instance( db_client=request.app.db_client ) @@ -157,6 +165,11 @@ async def process_endpoint(request: Request, project_id: int, process_request: P ) if do_reset == 1: + # delete associated vectors collection + collection_name = nlp_controller.create_collection_name(project_id=project.project_id) + _ = await 
request.app.vectordb_client.delete_collection(collection_name=collection_name) + + # delete associated chunks _ = await chunk_model.delete_chunks_by_project_id( project_id=project.project_id ) diff --git a/src/routes/nlp.py b/src/routes/nlp.py index c3c7e36d..2cfa159f 100644 --- a/src/routes/nlp.py +++ b/src/routes/nlp.py @@ -5,6 +5,7 @@ from models.ChunkModel import ChunkModel from controllers import NLPController from models import ResponseSignal +from tqdm.auto import tqdm import logging @@ -50,6 +51,19 @@ async def index_project(request: Request, project_id: int, push_request: PushReq inserted_items_count = 0 idx = 0 + # create collection if not exists + collection_name = nlp_controller.create_collection_name(project_id=project.project_id) + + _ = await request.app.vectordb_client.create_collection( + collection_name=collection_name, + embedding_size=request.app.embedding_client.embedding_size, + do_reset=push_request.do_reset, + ) + + # setup batching + total_chunks_count = await chunk_model.get_total_chunks_count(project_id=project.project_id) + pbar = tqdm(total=total_chunks_count, desc="Vector Indexing", position=0) + while has_records: page_chunks = await chunk_model.get_poject_chunks(project_id=project.project_id, page_no=page_no) if len(page_chunks): @@ -59,13 +73,12 @@ async def index_project(request: Request, project_id: int, push_request: PushReq has_records = False break - chunks_ids = list(range(idx, idx + len(page_chunks))) + chunks_ids = [ c.chunk_id for c in page_chunks ] idx += len(page_chunks) - is_inserted = nlp_controller.index_into_vector_db( + is_inserted = await nlp_controller.index_into_vector_db( project=project, chunks=page_chunks, - do_reset=push_request.do_reset, chunks_ids=chunks_ids ) @@ -76,7 +89,8 @@ async def index_project(request: Request, project_id: int, push_request: PushReq "signal": ResponseSignal.INSERT_INTO_VECTORDB_ERROR.value } ) - + + pbar.update(len(page_chunks)) inserted_items_count += len(page_chunks) return 
JSONResponse( @@ -104,7 +118,7 @@ async def get_project_index_info(request: Request, project_id: int): template_parser=request.app.template_parser, ) - collection_info = nlp_controller.get_vector_db_collection_info(project=project) + collection_info = await nlp_controller.get_vector_db_collection_info(project=project) return JSONResponse( content={ @@ -131,7 +145,7 @@ async def search_index(request: Request, project_id: int, search_request: Search template_parser=request.app.template_parser, ) - results = nlp_controller.search_vector_db_collection( + results = await nlp_controller.search_vector_db_collection( project=project, text=search_request.text, limit=search_request.limit ) @@ -168,7 +182,7 @@ async def answer_rag(request: Request, project_id: int, search_request: SearchRe template_parser=request.app.template_parser, ) - answer, full_prompt, chat_history = nlp_controller.answer_rag_question( + answer, full_prompt, chat_history = await nlp_controller.answer_rag_question( project=project, query=search_request.text, limit=search_request.limit, diff --git a/src/stores/llm/providers/CoHereProvider.py b/src/stores/llm/providers/CoHereProvider.py index 91623303..ac0cac4f 100644 --- a/src/stores/llm/providers/CoHereProvider.py +++ b/src/stores/llm/providers/CoHereProvider.py @@ -2,6 +2,7 @@ from ..LLMEnums import CoHereEnums, DocumentTypeEnum import cohere import logging +from typing import List, Union class CoHereProvider(LLMInterface): @@ -64,11 +65,14 @@ def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: i return response.text - def embed_text(self, text: str, document_type: str = None): + def embed_text(self, text: Union[str, List[str]], document_type: str = None): if not self.client: self.logger.error("CoHere client was not set") return None + if isinstance(text, str): + text = [text] + if not self.embedding_model_id: self.logger.error("Embedding model for CoHere was not set") return None @@ -79,7 +83,7 @@ def embed_text(self, text: 
str, document_type: str = None): response = self.client.embed( model = self.embedding_model_id, - texts = [self.process_text(text)], + texts = [ self.process_text(t) for t in text ], input_type = input_type, embedding_types=['float'], ) @@ -88,7 +92,7 @@ def embed_text(self, text: str, document_type: str = None): self.logger.error("Error while embedding text with CoHere") return None - return response.embeddings.float[0] + return [ f for f in response.embeddings.float ] def construct_prompt(self, prompt: str, role: str): return { diff --git a/src/stores/llm/providers/OpenAIProvider.py b/src/stores/llm/providers/OpenAIProvider.py index 89fae5e5..c1a2b375 100644 --- a/src/stores/llm/providers/OpenAIProvider.py +++ b/src/stores/llm/providers/OpenAIProvider.py @@ -2,6 +2,7 @@ from ..LLMEnums import OpenAIEnums from openai import OpenAI import logging +from typing import List, Union class OpenAIProvider(LLMInterface): @@ -72,11 +73,14 @@ def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: i return response.choices[0].message.content - def embed_text(self, text: str, document_type: str = None): + def embed_text(self, text: Union[str, List[str]], document_type: str = None): if not self.client: self.logger.error("OpenAI client was not set") return None + + if isinstance(text, str): + text = [text] if not self.embedding_model_id: self.logger.error("Embedding model for OpenAI was not set") @@ -91,7 +95,7 @@ def embed_text(self, text: str, document_type: str = None): self.logger.error("Error while embedding text with OpenAI") return None - return response.data[0].embedding + return [ rec.embedding for rec in response.data ] def construct_prompt(self, prompt: str, role: str): return { diff --git a/src/stores/vectordb/VectorDBEnums.py b/src/stores/vectordb/VectorDBEnums.py index b5b5eae5..783e4d99 100644 --- a/src/stores/vectordb/VectorDBEnums.py +++ b/src/stores/vectordb/VectorDBEnums.py @@ -17,7 +17,7 @@ class PgVectorTableSchemeEnums(Enum): _PREFIX 
= 'pgvector' class PgVectorDistanceMethodEnums(Enum): - COSINE = "vector_cosine_ops " + COSINE = "vector_cosine_ops" DOT = "vector_l2_ops" class PgVectorIndexTypeEnums(Enum): diff --git a/src/stores/vectordb/VectorDBProviderFactory.py b/src/stores/vectordb/VectorDBProviderFactory.py index df2ac486..0705dff7 100644 --- a/src/stores/vectordb/VectorDBProviderFactory.py +++ b/src/stores/vectordb/VectorDBProviderFactory.py @@ -1,19 +1,31 @@ -from .providers import QdrantDBProvider +from .providers import QdrantDBProvider, PGVectorProvider from .VectorDBEnums import VectorDBEnums from controllers.BaseController import BaseController +from sqlalchemy.orm import sessionmaker class VectorDBProviderFactory: - def __init__(self, config): + def __init__(self, config, db_client: sessionmaker=None): self.config = config self.base_controller = BaseController() + self.db_client = db_client def create(self, provider: str): if provider == VectorDBEnums.QDRANT.value: - db_path = self.base_controller.get_database_path(db_name=self.config.VECTOR_DB_PATH) + qdrant_db_client = self.base_controller.get_database_path(db_name=self.config.VECTOR_DB_PATH) return QdrantDBProvider( - db_path=db_path, + db_client=qdrant_db_client, distance_method=self.config.VECTOR_DB_DISTANCE_METHOD, + default_vector_size=self.config.EMBEDDING_MODEL_SIZE, + index_threshold=self.config.VECTOR_DB_PGVEC_INDEX_THRESHOLD, + ) + + if provider == VectorDBEnums.PGVECTOR.value: + return PGVectorProvider( + db_client=self.db_client, + distance_method=self.config.VECTOR_DB_DISTANCE_METHOD, + default_vector_size=self.config.EMBEDDING_MODEL_SIZE, + index_threshold=self.config.VECTOR_DB_PGVEC_INDEX_THRESHOLD, ) return None diff --git a/src/stores/vectordb/providers/PGVectorProvider.py b/src/stores/vectordb/providers/PGVectorProvider.py new file mode 100644 index 00000000..d11d6524 --- /dev/null +++ b/src/stores/vectordb/providers/PGVectorProvider.py @@ -0,0 +1,296 @@ +from ..VectorDBInterface import VectorDBInterface +from 
..VectorDBEnums import (DistanceMethodEnums, PgVectorTableSchemeEnums, + PgVectorDistanceMethodEnums, PgVectorIndexTypeEnums) +import logging +from typing import List +from models.db_schemes import RetrievedDocument +from sqlalchemy.sql import text as sql_text +import json + +class PGVectorProvider(VectorDBInterface): + + def __init__(self, db_client, default_vector_size: int = 786, + distance_method: str = None, index_threshold: int=100): + + self.db_client = db_client + self.default_vector_size = default_vector_size + + self.index_threshold = index_threshold + + if distance_method == DistanceMethodEnums.COSINE.value: + distance_method = PgVectorDistanceMethodEnums.COSINE.value + elif distance_method == DistanceMethodEnums.DOT.value: + distance_method = PgVectorDistanceMethodEnums.DOT.value + + self.pgvector_table_prefix = PgVectorTableSchemeEnums._PREFIX.value + self.distance_method = distance_method + + self.logger = logging.getLogger("uvicorn") + self.default_index_name = lambda collection_name: f"{collection_name}_vector_idx" + + + async def connect(self): + async with self.db_client() as session: + async with session.begin(): + await session.execute(sql_text( + "CREATE EXTENSION IF NOT EXISTS vector" + )) + await session.commit() + + async def disconnect(self): + pass + + async def is_collection_existed(self, collection_name: str) -> bool: + + record = None + async with self.db_client() as session: + async with session.begin(): + list_tbl = sql_text(f'SELECT * FROM pg_tables WHERE tablename = :collection_name') + results = await session.execute(list_tbl, {"collection_name": collection_name}) + record = results.scalar_one_or_none() + + return record + + async def list_all_collections(self) -> List: + records = [] + async with self.db_client() as session: + async with session.begin(): + list_tbl = sql_text('SELECT tablename FROM pg_tables WHERE tablename LIKE :prefix') + results = await session.execute(list_tbl, {"prefix": self.pgvector_table_prefix}) + records 
= results.scalars().all() + + return records + + async def get_collection_info(self, collection_name: str) -> dict: + async with self.db_client() as session: + async with session.begin(): + + table_info_sql = sql_text(f''' + SELECT schemaname, tablename, tableowner, tablespace, hasindexes + FROM pg_tables + WHERE tablename = :collection_name + ''') + + count_sql = sql_text(f'SELECT COUNT(*) FROM {collection_name}') + + table_info = await session.execute(table_info_sql, {"collection_name": collection_name}) + record_count = await session.execute(count_sql) + + table_data = table_info.fetchone() + if not table_data: + return None + + return { + "table_info": { + "schemaname": table_data[0], + "tablename": table_data[1], + "tableowner": table_data[2], + "tablespace": table_data[3], + "hasindexes": table_data[4], + }, + "record_count": record_count.scalar_one(), + } + + async def delete_collection(self, collection_name: str): + async with self.db_client() as session: + async with session.begin(): + self.logger.info(f"Deleting collection: {collection_name}") + + delete_sql = sql_text(f'DROP TABLE IF EXISTS {collection_name}') + await session.execute(delete_sql) + await session.commit() + + return True + + async def create_collection(self, collection_name: str, + embedding_size: int, + do_reset: bool = False): + + if do_reset: + _ = await self.delete_collection(collection_name=collection_name) + + is_collection_existed = await self.is_collection_existed(collection_name=collection_name) + if not is_collection_existed: + self.logger.info(f"Creating collection: {collection_name}") + async with self.db_client() as session: + async with session.begin(): + create_sql = sql_text( + f'CREATE TABLE {collection_name} (' + f'{PgVectorTableSchemeEnums.ID.value} bigserial PRIMARY KEY,' + f'{PgVectorTableSchemeEnums.TEXT.value} text, ' + f'{PgVectorTableSchemeEnums.VECTOR.value} vector({embedding_size}), ' + f'{PgVectorTableSchemeEnums.METADATA.value} jsonb DEFAULT \'{{}}\', ' + 
f'{PgVectorTableSchemeEnums.CHUNK_ID.value} integer, ' + f'FOREIGN KEY ({PgVectorTableSchemeEnums.CHUNK_ID.value}) REFERENCES chunks(chunk_id)' + ')' + ) + await session.execute(create_sql) + await session.commit() + + return True + + return False + + async def is_index_existed(self, collection_name: str) -> bool: + index_name = self.default_index_name(collection_name) + async with self.db_client() as session: + async with session.begin(): + check_sql = sql_text(f""" + SELECT 1 + FROM pg_indexes + WHERE tablename = :collection_name + AND indexname = :index_name + """) + results = await session.execute(check_sql, {"index_name": index_name, "collection_name": collection_name}) + + return bool(results.scalar_one_or_none()) + + async def create_vector_index(self, collection_name: str, + index_type: str = PgVectorIndexTypeEnums.HNSW.value): + is_index_existed = await self.is_index_existed(collection_name=collection_name) + if is_index_existed: + return False + + async with self.db_client() as session: + async with session.begin(): + count_sql = sql_text(f'SELECT COUNT(*) FROM {collection_name}') + result = await session.execute(count_sql) + records_count = result.scalar_one() + + if records_count < self.index_threshold: + return False + + self.logger.info(f"START: Creating vector index for collection: {collection_name}") + + index_name = self.default_index_name(collection_name) + create_idx_sql = sql_text( + f'CREATE INDEX {index_name} ON {collection_name} ' + f'USING {index_type} ({PgVectorTableSchemeEnums.VECTOR.value} {self.distance_method})' + ) + + await session.execute(create_idx_sql) + + self.logger.info(f"END: Created vector index for collection: {collection_name}") + + async def reset_vector_index(self, collection_name: str, + index_type: str = PgVectorIndexTypeEnums.HNSW.value) -> bool: + + index_name = self.default_index_name(collection_name) + async with self.db_client() as session: + async with session.begin(): + drop_sql = sql_text(f'DROP INDEX IF EXISTS 
{index_name}') + await session.execute(drop_sql) + + return await self.create_vector_index(collection_name=collection_name, index_type=index_type) + + + async def insert_one(self, collection_name: str, text: str, vector: list, + metadata: dict = None, + record_id: str = None): + + is_collection_existed = await self.is_collection_existed(collection_name=collection_name) + if not is_collection_existed: + self.logger.error(f"Can not insert new record to non-existed collection: {collection_name}") + return False + + if not record_id: + self.logger.error(f"Can not insert new record without chunk_id: {collection_name}") + return False + + async with self.db_client() as session: + async with session.begin(): + insert_sql = sql_text(f'INSERT INTO {collection_name} ' + f'({PgVectorTableSchemeEnums.TEXT.value}, {PgVectorTableSchemeEnums.VECTOR.value}, {PgVectorTableSchemeEnums.METADATA.value}, {PgVectorTableSchemeEnums.CHUNK_ID.value}) ' + 'VALUES (:text, :vector, :metadata, :chunk_id)' + ) + + metadata_json = json.dumps(metadata, ensure_ascii=False) if metadata is not None else "{}" + await session.execute(insert_sql, { + 'text': text, + 'vector': "[" + ",".join([ str(v) for v in vector ]) + "]", + 'metadata': metadata_json, + 'chunk_id': record_id + }) + await session.commit() + + await self.create_vector_index(collection_name=collection_name) + + return True + + + async def insert_many(self, collection_name: str, texts: list, + vectors: list, metadata: list = None, + record_ids: list = None, batch_size: int = 50): + + is_collection_existed = await self.is_collection_existed(collection_name=collection_name) + if not is_collection_existed: + self.logger.error(f"Can not insert new records to non-existed collection: {collection_name}") + return False + + if len(vectors) != len(record_ids): + self.logger.error(f"Invalid data items for collection: {collection_name}") + return False + + if not metadata or len(metadata) == 0: + metadata = [None] * len(texts) + + async with 
self.db_client() as session: + async with session.begin(): + for i in range(0, len(texts), batch_size): + batch_texts = texts[i:i+batch_size] + batch_vectors = vectors[i:i + batch_size] + batch_metadata = metadata[i:i + batch_size] + batch_record_ids = record_ids[i:i + batch_size] + + values = [] + + for _text, _vector, _metadata, _record_id in zip(batch_texts, batch_vectors, batch_metadata, batch_record_ids): + + metadata_json = json.dumps(_metadata, ensure_ascii=False) if _metadata is not None else "{}" + values.append({ + 'text': _text, + 'vector': "[" + ",".join([ str(v) for v in _vector ]) + "]", + 'metadata': metadata_json, + 'chunk_id': _record_id + }) + + batch_insert_sql = sql_text(f'INSERT INTO {collection_name} ' + f'({PgVectorTableSchemeEnums.TEXT.value}, ' + f'{PgVectorTableSchemeEnums.VECTOR.value}, ' + f'{PgVectorTableSchemeEnums.METADATA.value}, ' + f'{PgVectorTableSchemeEnums.CHUNK_ID.value}) ' + f'VALUES (:text, :vector, :metadata, :chunk_id)') + + await session.execute(batch_insert_sql, values) + + await self.create_vector_index(collection_name=collection_name) + + return True + + async def search_by_vector(self, collection_name: str, vector: list, limit: int): + + is_collection_existed = await self.is_collection_existed(collection_name=collection_name) + if not is_collection_existed: + self.logger.error(f"Can not search for records in a non-existed collection: {collection_name}") + return False + + vector = "[" + ",".join([ str(v) for v in vector ]) + "]" + async with self.db_client() as session: + async with session.begin(): + search_sql = sql_text(f'SELECT {PgVectorTableSchemeEnums.TEXT.value} as text, 1 - ({PgVectorTableSchemeEnums.VECTOR.value} <=> :vector) as score' + f' FROM {collection_name}' + ' ORDER BY score DESC ' + f'LIMIT {limit}' + ) + + result = await session.execute(search_sql, {"vector": vector}) + + records = result.fetchall() + + return [ + RetrievedDocument( + text=record.text, + score=record.score + ) + for record in records 
+ ] diff --git a/src/stores/vectordb/providers/QdrantDBProvider.py b/src/stores/vectordb/providers/QdrantDBProvider.py index 7e2492a3..57146481 100644 --- a/src/stores/vectordb/providers/QdrantDBProvider.py +++ b/src/stores/vectordb/providers/QdrantDBProvider.py @@ -7,45 +7,50 @@ class QdrantDBProvider(VectorDBInterface): - def __init__(self, db_path: str, distance_method: str): + def __init__(self, db_client: str, default_vector_size: int = 786, + distance_method: str = None, index_threshold: int=100): self.client = None - self.db_path = db_path + self.db_client = db_client self.distance_method = None + self.default_vector_size = default_vector_size if distance_method == DistanceMethodEnums.COSINE.value: self.distance_method = models.Distance.COSINE elif distance_method == DistanceMethodEnums.DOT.value: self.distance_method = models.Distance.DOT - self.logger = logging.getLogger(__name__) + self.logger = logging.getLogger('uvicorn') - def connect(self): - self.client = QdrantClient(path=self.db_path) + async def connect(self): + self.client = QdrantClient(path=self.db_client) - def disconnect(self): + async def disconnect(self): self.client = None - def is_collection_existed(self, collection_name: str) -> bool: + async def is_collection_existed(self, collection_name: str) -> bool: return self.client.collection_exists(collection_name=collection_name) - def list_all_collections(self) -> List: + async def list_all_collections(self) -> List: return self.client.get_collections() def get_collection_info(self, collection_name: str) -> dict: return self.client.get_collection(collection_name=collection_name) - def delete_collection(self, collection_name: str): + async def delete_collection(self, collection_name: str): if self.is_collection_existed(collection_name): + self.logger.info(f"Deleting collection: {collection_name}") return self.client.delete_collection(collection_name=collection_name) - def create_collection(self, collection_name: str, + async def 
create_collection(self, collection_name: str, embedding_size: int, do_reset: bool = False): if do_reset: _ = self.delete_collection(collection_name=collection_name) if not self.is_collection_existed(collection_name): + self.logger.info(f"Creating new Qdrant collection: {collection_name}") + _ = self.client.create_collection( collection_name=collection_name, vectors_config=models.VectorParams( @@ -58,7 +63,7 @@ def create_collection(self, collection_name: str, return False - def insert_one(self, collection_name: str, text: str, vector: list, + async def insert_one(self, collection_name: str, text: str, vector: list, metadata: dict = None, record_id: str = None): @@ -85,7 +90,7 @@ def insert_one(self, collection_name: str, text: str, vector: list, return True - def insert_many(self, collection_name: str, texts: list, + async def insert_many(self, collection_name: str, texts: list, vectors: list, metadata: list = None, record_ids: list = None, batch_size: int = 50): @@ -126,7 +131,7 @@ def insert_many(self, collection_name: str, texts: list, return True - def search_by_vector(self, collection_name: str, vector: list, limit: int = 5): + async def search_by_vector(self, collection_name: str, vector: list, limit: int = 5): results = self.client.search( collection_name=collection_name, diff --git a/src/stores/vectordb/providers/__init__.py b/src/stores/vectordb/providers/__init__.py index 139cec85..75bfb8e1 100644 --- a/src/stores/vectordb/providers/__init__.py +++ b/src/stores/vectordb/providers/__init__.py @@ -1 +1,2 @@ from .QdrantDBProvider import QdrantDBProvider +from .PGVectorProvider import PGVectorProvider From 90ab865e437b5ea88f761acbafcf40f807b4a82d Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Fri, 21 Mar 2025 21:35:33 +0200 Subject: [PATCH 36/65] update README --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c956f9d5..69e741fc 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,12 @@ 
This is an educational project where all of the codes where explained (step by s | 18 | Checkpoint-1 + Fix Issues | [Video](https://youtu.be/6zG4Idxldvg) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | | 19 | Ollama Local LLM Server | [Video](https://youtu.be/-epZ1hAAtrs) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | | 20 | From Mongo to Postgres + SQLAlchemy & Alembic | [Video](https://www.youtube.com/watch?v=BVOq7Ek2Up0) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-013) | +| 21 | The way to PgVector | [Video](https://www.youtube.com/watch?v=g99yq5zlYAE) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-014) | ## Requirements -- Python 3.8 or later +- Python 3.10 #### Install Dependencies @@ -46,7 +47,7 @@ sudo apt install libpq-dev gcc python3-dev 1) Download and install MiniConda from [here](https://docs.anaconda.com/free/miniconda/#quick-command-line-install) 2) Create a new environment using the following command: ```bash -$ conda create -n mini-rag python=3.8 +$ conda create -n mini-rag python=3.10 ``` 3) Activate the environment: ```bash From 06dbb3af9be9a8e6d500076bbee0802f6d7f92d9 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Fri, 18 Apr 2025 12:39:25 +0200 Subject: [PATCH 37/65] setup docker compose for production --- docker/.env.example | 3 - docker/.gitignore | 5 +- docker/README.md | 183 ++++++++++++++++++++++ docker/docker-compose.yml | 139 ++++++++++++++-- docker/env/.env.example.app | 40 +++++ docker/env/.env.example.grafana | 4 + docker/env/.env.example.postgres | 4 + docker/env/.env.example.postgres-exporter | 3 + docker/minirag/Dockerfile | 35 +++++ docker/minirag/alembic.example.ini | 117 ++++++++++++++ docker/minirag/entrypoint.sh | 7 + docker/nginx/default.conf | 17 ++ docker/prometheus/prometheus.yml | 27 ++++ src/main.py | 6 + src/requirements.txt | 7 +- src/utils/__init__.py | 0 src/utils/mertics.py | 36 +++++ 17 files changed, 614 insertions(+), 19 deletions(-) delete mode 
100644 docker/.env.example create mode 100644 docker/README.md create mode 100644 docker/env/.env.example.app create mode 100644 docker/env/.env.example.grafana create mode 100644 docker/env/.env.example.postgres create mode 100644 docker/env/.env.example.postgres-exporter create mode 100644 docker/minirag/Dockerfile create mode 100644 docker/minirag/alembic.example.ini create mode 100644 docker/minirag/entrypoint.sh create mode 100644 docker/nginx/default.conf create mode 100644 docker/prometheus/prometheus.yml create mode 100644 src/utils/__init__.py create mode 100644 src/utils/mertics.py diff --git a/docker/.env.example b/docker/.env.example deleted file mode 100644 index 092d0b77..00000000 --- a/docker/.env.example +++ /dev/null @@ -1,3 +0,0 @@ -MONGO_INITDB_ROOT_USERNAME= -MONGO_INITDB_ROOT_PASSWORD= -POSTGRES_PASSWORD= diff --git a/docker/.gitignore b/docker/.gitignore index 26477da4..975aa757 100644 --- a/docker/.gitignore +++ b/docker/.gitignore @@ -1,2 +1,3 @@ -mongodb -.env +env/.env* +!env/.env.example.* +minirag/alembic.ini diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000..fba41cad --- /dev/null +++ b/docker/README.md @@ -0,0 +1,183 @@ +# Docker Setup for MiniRAG Application + +This directory contains the Docker setup for the MiniRAG application, including all necessary services for development and monitoring. + +## Services + +- **FastAPI Application**: Main application running on Uvicorn +- **Nginx**: Web server for serving the FastAPI application +- **PostgreSQL (pgvector)**: Vector-enabled database for storing embeddings +- **Postgres-Exporter**: Exports PostgreSQL metrics for Prometheus +- **Qdrant**: Vector database for similarity search +- **Prometheus**: Metrics collection +- **Grafana**: Visualization dashboard for metrics +- **Node-Exporter**: System metrics collection + +## Setup Instructions + +### 1. 
Set up environment files + +Create your environment files from the examples: + +```bash +# Create all required .env files from examples +cd docker/env +cp .env.example.app .env.app +cp .env.example.postgres .env.postgres +cp .env.example.grafana .env.grafana +cp .env.example.postgres-exporter .env.postgres-exporter + +# Setup the Alembic configuration for the FastAPI application +cd .. +cd docker/minirag +cp alembic.example.ini alembic.ini + +### 2. Start the services + +```bash +cd docker +docker compose up --build -d +``` + +To start only specific services: + +```bash +docker compose up -d fastapi nginx pgvector qdrant +``` + +If you encounter connection issues, you may want to start the database services first and let them initialize before starting the application: + +```bash +# Start databases first +docker compose up -d pgvector qdrant postgres-exporter +# Wait for databases to be healthy +sleep 30 +# Start the application services +docker compose up fastapi nginx prometheus grafana node-exporter --build -d +``` + +In case deleting all containers and volumes is necessary, you can run: + +```bash +docker compose down -v --remove-orphans +``` + +### 3. Access the services + +- FastAPI Application: http://localhost:8000 +- FastAPI Documentation: http://localhost:8000/docs +- Nginx (serving FastAPI): http://localhost +- Prometheus: http://localhost:9090 +- Grafana: http://localhost:3000 +- Qdrant UI: http://localhost:6333/dashboard + +## Volume Management + +### Managing Docker Volumes + +Docker volumes are used to persist data generated by and used by Docker containers. Here are some commands to manage your volumes: + +1. **List all volumes**: + ```bash + docker volume ls + ``` +2. **Inspect a volume**: + ```bash + docker volume inspect + ``` + + - list files in a volume: + ```bash + docker run --rm -v :/data busybox ls -l /data + ``` + +3. **Remove a volume**: + ```bash + docker volume rm + ``` +4. 
**Prune unused volumes**: + ```bash + docker volume prune + ``` + +5. **Backup volume for migration**: + ```bash + docker run --rm -v :/volume -v $(pwd):/backup alpine tar cvf /backup/backup.tar /volume + ``` + +6. **Restore volume from backup**: + ```bash + docker run --rm -v :/volume -v $(pwd):/backup alpine sh -c "cd /volume && tar xvf /backup/backup.tar --strip 1" + ``` + +7. **Remove all volumes**: + ```bash + docker volume rm $(docker volume ls -q) + ``` + +**NOTE**: For PostgreSQL specifically, you might want to consider using PostgreSQL's built-in tools like `pg_dump` and `pg_restore` for more reliable backups, especially for live databases. + +## Monitoring + +### FastAPI Metrics + +FastAPI is configured to expose Prometheus metrics at the `/metrics` endpoint. These metrics include: + +- Request counts +- Request latencies +- Status codes + +Prometheus is configured to scrape these metrics automatically. + +### Visualizing Metrics in Grafana + +1. Log into Grafana at http://localhost:3000 (default credentials: admin/admin_password) +2. Add Prometheus as a data source (URL: http://prometheus:9090) +3. Import dashboards for FastAPI, PostgreSQL, and Qdrant + +#### Dashboards URLs + +https://grafana.com/grafana/dashboards/18739-fastapi-observability/ + +https://grafana.com/grafana/dashboards/1860-node-exporter-full/ + +https://grafana.com/grafana/dashboards/23033-qdrant/ + +https://grafana.com/grafana/dashboards/12485-postgresql-exporter/ + + +## Development Workflow + +The FastAPI application is configured with hot-reloading. Any changes to the code in the `src/` directory will automatically reload the application. + +## Troubleshooting + +### Connection Errors + +If you see connection errors when starting the services: + +1. **Database Connection Refused**: This often happens when the FastAPI app tries to connect to databases before they're ready. 
+ ``` + Connection refused: [Errno 111] Connection refused + ``` + + Solutions: + - Start database services first, wait, then start the application + - Check database logs: `docker compose logs pgvector` + - Ensure your database credentials in `.env.app` match those in `.env.postgres` + +2. **Restart the FastAPI service** after databases are running: + ```bash + docker compose restart fastapi + ``` + +3. **Check service status**: + ```bash + docker compose ps + ``` + +4. **View logs** for more details: + ```bash + docker compose logs --tail=100 fastapi + docker compose logs --tail=100 pgvector + ``` diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 4dd79847..7c54655a 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,34 +1,147 @@ services: - mongodb: - image: mongo:7-jammy - container_name: mongodb + # FastAPI Application + fastapi: + build: + context: .. + dockerfile: docker/minirag/Dockerfile + container_name: fastapi ports: - - "27007:27017" + - "8000:8000" volumes: - - mongodata:/data/db - environment: - - MONGO_INITDB_ROOT_USERNAME=${MONGO_INITDB_ROOT_USERNAME} - - MONGO_INITDB_ROOT_PASSWORD=${MONGO_INITDB_ROOT_PASSWORD} + - fastapi_data:/app/assets networks: - backend restart: always + depends_on: + pgvector: + condition: service_healthy + env_file: + - ./env/.env.app + # Nginx Service + nginx: + image: nginx:stable-alpine3.20-perl + container_name: nginx + ports: + - "80:80" + volumes: + - ./nginx/default.conf:/etc/nginx/conf.d/default.conf + depends_on: + - fastapi + networks: + - backend + restart: always + + + # PostgreSQL (pgvector) pgvector: image: pgvector/pgvector:0.8.0-pg17 container_name: pgvector ports: - "5432:5432" volumes: - - pgvector_data:/var/lib/postgresql/data - environment: - - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} + - 'pgvector:/var/lib/postgresql/data' + env_file: + - ./env/.env.postgres + networks: + - backend + restart: always + healthcheck: + test: ["CMD-SHELL", "pg_isready -U 
postgres"] + interval: 5s + timeout: 5s + retries: 5 + start_period: 10s + + # Qdrant (VectorDB) + qdrant: + image: qdrant/qdrant:v1.13.6 + container_name: qdrant + ports: + - "6333:6333" + - "6334:6334" + volumes: + - qdrant_data:/qdrant/storage + networks: + - backend + restart: always + + # Prometheus Monitoring + prometheus: + image: prom/prometheus:v3.3.0 + container_name: prometheus + ports: + - "9090:9090" + volumes: + - prometheus_data:/prometheus + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + networks: + - backend + restart: always + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + + # Grafana Dashboard + grafana: + image: grafana/grafana:11.6.0-ubuntu + container_name: grafana + ports: + - "3000:3000" + volumes: + - grafana_data:/var/lib/grafana + env_file: + - ./env/.env.grafana + depends_on: + - prometheus + networks: + - backend + restart: always + + + # Node Exporter for system metrics + node-exporter: + image: prom/node-exporter:v1.9.1 + container_name: node-exporter + ports: + - "9100:9100" + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + networks: + - backend + restart: always + + # PostgreSQL Exporter for Postgres metrics + postgres-exporter: + image: prometheuscommunity/postgres-exporter:v0.17.1 + container_name: postgres-exporter + ports: + - "9187:9187" + env_file: + - ./env/.env.postgres-exporter + depends_on: + - pgvector networks: - backend restart: always networks: backend: + driver: bridge volumes: - mongodata: - pgvector_data: + fastapi_data: + pgvector: + qdrant_data: + prometheus_data: + grafana_data: diff --git 
a/docker/env/.env.example.app b/docker/env/.env.example.app new file mode 100644 index 00000000..4b9494ed --- /dev/null +++ b/docker/env/.env.example.app @@ -0,0 +1,40 @@ +APP_NAME="mini-RAG" +APP_VERSION="0.1" + +FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + +POSTGRES_USERNAME="postgres" +POSTGRES_PASSWORD="postgres_password" +POSTGRES_HOST="pgvector" +POSTGRES_PORT=5432 +POSTGRES_MAIN_DATABASE="minirag" + +# ========================= LLM Config ========================= +GENERATION_BACKEND = "OPENAI" +EMBEDDING_BACKEND = "COHERE" + +OPENAI_API_KEY="key___" +OPENAI_API_URL= "" +COHERE_API_KEY="key___" + +GENERATION_MODEL_ID_LITERAL = ["gpt-4o-mini", "gemma2:9b-instruct-q5_0"] +GENERATION_MODEL_ID="gpt-4o-mini" +EMBEDDING_MODEL_ID="embed-multilingual-v3.0" +EMBEDDING_MODEL_SIZE=1024 + +INPUT_DAFAULT_MAX_CHARACTERS=1024 +GENERATION_DAFAULT_MAX_TOKENS=200 +GENERATION_DAFAULT_TEMPERATURE=0.1 + +# ========================= Vector DB Config ========================= +VECTOR_DB_BACKEND_LITERAL = ["QDRANT", "PGVECTOR"] +VECTOR_DB_BACKEND = "PGVECTOR" +VECTOR_DB_PATH = "qdrant_db" +VECTOR_DB_DISTANCE_METHOD = "cosine" +VECTOR_DB_PGVEC_INDEX_THRESHOLD = 100 + +# ========================= Template Config ========================= +PRIMARY_LANG = "en" +DEFAULT_LANG = "en" diff --git a/docker/env/.env.example.grafana b/docker/env/.env.example.grafana new file mode 100644 index 00000000..127bedc9 --- /dev/null +++ b/docker/env/.env.example.grafana @@ -0,0 +1,4 @@ +# Grafana Environment Variables +GF_SECURITY_ADMIN_USER=admin +GF_SECURITY_ADMIN_PASSWORD=admin_password +GF_USERS_ALLOW_SIGN_UP=false diff --git a/docker/env/.env.example.postgres b/docker/env/.env.example.postgres new file mode 100644 index 00000000..cf4f155e --- /dev/null +++ b/docker/env/.env.example.postgres @@ -0,0 +1,4 @@ +# PostgreSQL Environment Variables +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres_password +POSTGRES_DB=minirag diff 
--git a/docker/env/.env.example.postgres-exporter b/docker/env/.env.example.postgres-exporter new file mode 100644 index 00000000..27483a27 --- /dev/null +++ b/docker/env/.env.example.postgres-exporter @@ -0,0 +1,3 @@ +DATA_SOURCE_URI=pgvector:5432/postgres?sslmode=disable +DATA_SOURCE_USER=postgres +DATA_SOURCE_PASS=postgres_password \ No newline at end of file diff --git a/docker/minirag/Dockerfile b/docker/minirag/Dockerfile new file mode 100644 index 00000000..7447fbab --- /dev/null +++ b/docker/minirag/Dockerfile @@ -0,0 +1,35 @@ +FROM ghcr.io/astral-sh/uv:0.6.14-python3.10-bookworm + +WORKDIR /app + +# Install additional system dependencies for lxml and other packages +RUN apt-get update && apt-get install -y \ + build-essential \ + libavif-dev pkg-config \ + libjpeg-dev \ + gcc unzip zip \ + python3-dev \ + libxml2-dev \ + libxslt1-dev \ + libffi-dev \ + curl \ + && rm -rf /var/lib/apt/lists/* + +COPY src/requirements.txt . + +RUN uv pip install -r requirements.txt --system + +COPY src/ . + +# Create directory structure for Alembic +RUN mkdir -p /app/models/db_schemes/minirag/ + +COPY docker/minirag/alembic.ini /app/models/db_schemes/minirag/alembic.ini + +COPY docker/minirag/entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] + +# Command to run the application +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"] diff --git a/docker/minirag/alembic.example.ini b/docker/minirag/alembic.example.ini new file mode 100644 index 00000000..346ec24f --- /dev/null +++ b/docker/minirag/alembic.example.ini @@ -0,0 +1,117 @@ +# A generic, single database configuration. 
+ +[alembic] +# path to migration scripts +# Use forward slashes (/) also on windows to provide an os agnostic path +script_location = alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +# version_path_separator = newline +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = postgresql://postgres:postgres_password@pgvector:5432/minirag + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/docker/minirag/entrypoint.sh b/docker/minirag/entrypoint.sh new file mode 
100644 index 00000000..198c8cb0 --- /dev/null +++ b/docker/minirag/entrypoint.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +echo "Running database migrations..." +cd /app/models/db_schemes/minirag/ +alembic upgrade head +cd /app diff --git a/docker/nginx/default.conf b/docker/nginx/default.conf new file mode 100644 index 00000000..87632089 --- /dev/null +++ b/docker/nginx/default.conf @@ -0,0 +1,17 @@ +server { + listen 80; + server_name localhost; + + location / { + proxy_pass http://fastapi:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Optionally expose metrics endpoint directly + location /TrhBVe_m5gg2002_E5VVqS { + proxy_pass http://fastapi:8000/TrhBVe_m5gg2002_E5VVqS; + } +} \ No newline at end of file diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml new file mode 100644 index 00000000..802caa6d --- /dev/null +++ b/docker/prometheus/prometheus.yml @@ -0,0 +1,27 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + + - job_name: 'fastapi' + static_configs: + - targets: ['fastapi:8000'] + metrics_path: '/TrhBVe_m5gg2002_E5VVqS' + + - job_name: 'node-exporter' + static_configs: + - targets: ['node-exporter:9100'] + + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'qdrant' + static_configs: + - targets: ['qdrant:6333'] + metrics_path: '/metrics' + + - job_name: 'postgres' + static_configs: + - targets: ['postgres-exporter:9187'] diff --git a/src/main.py b/src/main.py index 36ec33fb..ffd2c2e3 100644 --- a/src/main.py +++ b/src/main.py @@ -7,8 +7,14 @@ from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession from sqlalchemy.orm import sessionmaker +# Import metrics setup +from utils.metrics import setup_metrics + app = FastAPI() +# Setup Prometheus metrics +setup_metrics(app) + async def startup_span(): 
settings = get_settings() diff --git a/src/requirements.txt b/src/requirements.txt index 19708670..e2dc704d 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -8,7 +8,7 @@ langchain==0.1.20 PyMuPDF==1.24.3 motor==3.4.0 pydantic-mongo==2.3.0 -openai==1.66.3 +openai==1.75.0 cohere==5.5.8 qdrant-client==1.10.1 SQLAlchemy==2.0.36 @@ -17,3 +17,8 @@ alembic==1.14.0 psycopg2==2.9.10 pgvector==0.4.0 nltk==3.9.1 + +# Monitoring and metrics +prometheus-client==0.21.1 +starlette-exporter==0.23.0 +fastapi-health==0.4.0 diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/utils/mertics.py b/src/utils/mertics.py new file mode 100644 index 00000000..1b189f3d --- /dev/null +++ b/src/utils/mertics.py @@ -0,0 +1,36 @@ +from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST +from fastapi import FastAPI, Request, Response +from starlette.middleware.base import BaseHTTPMiddleware +import time + +# Define metrics +REQUEST_COUNT = Counter('http_requests_total', 'Total HTTP Requests', ['method', 'endpoint', 'status']) +REQUEST_LATENCY = Histogram('http_request_duration_seconds', 'HTTP Request Latency', ['method', 'endpoint']) + +class PrometheusMiddleware(BaseHTTPMiddleware): + async def dispatch(self, request: Request, call_next): + + start_time = time.time() + + # Process the request + response = await call_next(request) + + # Record metrics after request is processed + duration = time.time() - start_time + endpoint = request.url.path + + REQUEST_LATENCY.labels(method=request.method, endpoint=endpoint).observe(duration) + REQUEST_COUNT.labels(method=request.method, endpoint=endpoint, status=response.status_code).inc() + + return response + +def setup_metrics(app: FastAPI): + """ + Setup Prometheus metrics middleware and endpoint + """ + # Add Prometheus middleware + app.add_middleware(PrometheusMiddleware) + + @app.get("/TrhBVe_m5gg2002_E5VVqS", include_in_schema=False) + 
def metrics(): + return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) From 31df1c40fb139c8749888c816f31d69f3d92001d Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 8 Jun 2025 18:45:00 +0300 Subject: [PATCH 38/65] fix file name --- src/utils/{mertics.py => metrics.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/utils/{mertics.py => metrics.py} (100%) diff --git a/src/utils/mertics.py b/src/utils/metrics.py similarity index 100% rename from src/utils/mertics.py rename to src/utils/metrics.py From dd1cbef08b532e899fcfbd89bf3e515fd192a6c5 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 8 Jun 2025 18:47:28 +0300 Subject: [PATCH 39/65] set github actions flows sample --- .github/.workflows/deploy-main.yml | 46 ++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/.workflows/deploy-main.yml diff --git a/.github/.workflows/deploy-main.yml b/.github/.workflows/deploy-main.yml new file mode 100644 index 00000000..73335756 --- /dev/null +++ b/.github/.workflows/deploy-main.yml @@ -0,0 +1,46 @@ +name: Deploy Main Branch to Server + +on: + push: + branches: + - develop + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4.2.2 + + - name: Deploy via SSH + uses: appleboy/ssh-action@v1.2.2 + with: + host: ${{ secrets.SSH_MAIN_HOST_IP }} + username: github_user + key: ${{ secrets.SSH_MAIN_PRIVATE_KEY }} + script: | + cd /home/github_user/workspace/mini-rag + git checkout main + git pull + sudo systemctl restart minirag.service + echo "Waiting..." + sleep 20 + + for i in {1..6}; do + if ss -tuln | grep -q ':80'; then + echo "✅ Port 80 is now active." + break + else + echo "⏳ Port 80 not ready yet. Retrying in 5 seconds..." + sleep 5 + fi + done + + if ! 
ss -tuln | grep -q ':80'; then + echo "❌ Service failed to start on port 80" + exit 1 + fi + + + From 39ee29316ef30ca294db8cd5a7261896ef2a7d0c Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 8 Jun 2025 18:48:25 +0300 Subject: [PATCH 40/65] update github actions filename --- .../.workflows/{deploy-main.yml => deploy-develop.yml} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename .github/.workflows/{deploy-main.yml => deploy-develop.yml} (84%) diff --git a/.github/.workflows/deploy-main.yml b/.github/.workflows/deploy-develop.yml similarity index 84% rename from .github/.workflows/deploy-main.yml rename to .github/.workflows/deploy-develop.yml index 73335756..f71ac894 100644 --- a/.github/.workflows/deploy-main.yml +++ b/.github/.workflows/deploy-develop.yml @@ -1,4 +1,4 @@ -name: Deploy Main Branch to Server +name: Deploy Develop Branch to Server on: push: @@ -16,12 +16,12 @@ jobs: - name: Deploy via SSH uses: appleboy/ssh-action@v1.2.2 with: - host: ${{ secrets.SSH_MAIN_HOST_IP }} + host: ${{ secrets.SSH_DEVELOP_HOST_IP }} username: github_user - key: ${{ secrets.SSH_MAIN_PRIVATE_KEY }} + key: ${{ secrets.SSH_DEVELOP_PRIVATE_KEY }} script: | cd /home/github_user/workspace/mini-rag - git checkout main + git checkout develop git pull sudo systemctl restart minirag.service echo "Waiting..." 
From f468cd859ba4575c7eaf2edeb2fe627b9e174b58 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 8 Jun 2025 18:49:10 +0300 Subject: [PATCH 41/65] update PGVectorProvider connect function --- .../vectordb/providers/PGVectorProvider.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/stores/vectordb/providers/PGVectorProvider.py b/src/stores/vectordb/providers/PGVectorProvider.py index d11d6524..137a3046 100644 --- a/src/stores/vectordb/providers/PGVectorProvider.py +++ b/src/stores/vectordb/providers/PGVectorProvider.py @@ -31,11 +31,21 @@ def __init__(self, db_client, default_vector_size: int = 786, async def connect(self): async with self.db_client() as session: - async with session.begin(): - await session.execute(sql_text( - "CREATE EXTENSION IF NOT EXISTS vector" + try: + # Check if vector extension already exists + result = await session.execute(sql_text( + "SELECT 1 FROM pg_extension WHERE extname = 'vector'" )) - await session.commit() + extension_exists = result.scalar_one_or_none() + + if not extension_exists: + # Only create if it doesn't exist + await session.execute(sql_text("CREATE EXTENSION vector")) + await session.commit() + except Exception as e: + # If extension already exists or any other error, just log and continue + self.logger.warning(f"Vector extension setup: {str(e)}") + await session.rollback() async def disconnect(self): pass From 5b47644dda7fd9a0a7f1dc7cdb0995ba59656a61 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 8 Jun 2025 18:50:26 +0300 Subject: [PATCH 42/65] set service file sample --- docker/minirag.service | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 docker/minirag.service diff --git a/docker/minirag.service b/docker/minirag.service new file mode 100644 index 00000000..6d998bd3 --- /dev/null +++ b/docker/minirag.service @@ -0,0 +1,23 @@ +[Unit] +Description=MiniRAG Docker Service +After=network.target docker.service 
+Requires=docker.service + +[Service] +Type=forking +RemainAfterExit=yes +User=github_user +Group=docker +WorkingDirectory=/home/github_user/workspace/mini-rag/docker +ExecStartPre=/bin/bash -c '/usr/bin/docker compose down || true' +ExecStartPre=/bin/sleep 5 +ExecStart=/usr/bin/docker compose up --build -d +ExecStop=/usr/bin/docker compose down +ExecReload=/usr/bin/docker compose restart +TimeoutStartSec=300 +TimeoutStopSec=120 +Restart=on-failure +RestartSec=10 + +[Install] +WantedBy=multi-user.target \ No newline at end of file From d89794c79e90e4edf56cf8c273a230efaa1bf04d Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sat, 2 Aug 2025 13:23:00 +0300 Subject: [PATCH 43/65] setup reddis + rabbitMQ docker services --- docker/docker-compose.yml | 43 +++++++++++++++++++++++++++++++- docker/env/.env.example.rabbitmq | 13 ++++++++++ docker/env/.env.example.redis | 12 +++++++++ docker/rabbitmq/rabbitmq.conf | 18 +++++++++++++ 4 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 docker/env/.env.example.rabbitmq create mode 100644 docker/env/.env.example.redis create mode 100644 docker/rabbitmq/rabbitmq.conf diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 7c54655a..dc010e25 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -38,7 +38,7 @@ services: image: pgvector/pgvector:0.8.0-pg17 container_name: pgvector ports: - - "5432:5432" + - "5400:5432" volumes: - 'pgvector:/var/lib/postgresql/data' env_file: @@ -135,6 +135,45 @@ services: - backend restart: always + # RabbitMQ (Message Broker) + rabbitmq: + image: rabbitmq:4.1.2-management-alpine + container_name: rabbitmq + ports: + - "5672:5672" # AMQP port + - "15672:15672" # Management UI port + volumes: + - rabbitmq_data:/var/lib/rabbitmq + - ./rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf + env_file: + - ./env/.env.rabbitmq + networks: + - backend + restart: always + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "ping"] + timeout: 10s + 
retries: 5 + + # Redis (Results Backend & Cache) + redis: + image: redis:8.0.3-alpine + container_name: redis + ports: + - "6379:6379" + volumes: + - redis_data:/data + env_file: + - ./env/.env.redis + networks: + - backend + restart: always + healthcheck: + test: ["CMD", "redis-cli", "ping"] + timeout: 10s + retries: 5 + command: ["redis-server", "--appendonly", "yes", "--requirepass", "${REDIS_PASSWORD:-minirag_redis_2222}"] + networks: backend: driver: bridge @@ -145,3 +184,5 @@ volumes: qdrant_data: prometheus_data: grafana_data: + rabbitmq_data: + redis_data: diff --git a/docker/env/.env.example.rabbitmq b/docker/env/.env.example.rabbitmq new file mode 100644 index 00000000..9c8e1174 --- /dev/null +++ b/docker/env/.env.example.rabbitmq @@ -0,0 +1,13 @@ +# RabbitMQ Configuration Example +RABBITMQ_DEFAULT_USER=minirag_user +RABBITMQ_DEFAULT_PASS=minirag_rabbitmq_2222 +RABBITMQ_DEFAULT_VHOST=minirag_vhost + +# Management Plugin +RABBITMQ_MANAGEMENT_ENABLED=true + +# Security +RABBITMQ_AUTH_BACKENDS=rabbit_auth_backend_internal + +# Performance +RABBITMQ_DISK_FREE_LIMIT=2000000000 diff --git a/docker/env/.env.example.redis b/docker/env/.env.example.redis new file mode 100644 index 00000000..01ce9b38 --- /dev/null +++ b/docker/env/.env.example.redis @@ -0,0 +1,12 @@ +# Redis Configuration Example +REDIS_PASSWORD=minirag_redis_2222 + +# Persistence +REDIS_APPENDONLY=yes + +# Memory Management +REDIS_MAXMEMORY=512mb +REDIS_MAXMEMORY_POLICY=allkeys-lru + +# Security +REDIS_PROTECTED_MODE=yes diff --git a/docker/rabbitmq/rabbitmq.conf b/docker/rabbitmq/rabbitmq.conf new file mode 100644 index 00000000..5cf61d57 --- /dev/null +++ b/docker/rabbitmq/rabbitmq.conf @@ -0,0 +1,18 @@ +# RabbitMQ Configuration File + +# Memory management +vm_memory_high_watermark.relative = 0.6 + +# Disk space management +disk_free_limit.absolute = 2GB + +# SSL/TLS configuration +ssl_options.verify = verify_none + +# Management plugin +management.tcp.port = 15672 + +# Logging +log.file.level = 
info +log.console = true +log.console.level = info From 4ffe00c116619c41c3710a76aa8ae88dfec1c13f Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sat, 2 Aug 2025 13:30:56 +0300 Subject: [PATCH 44/65] update requirements packags --- src/requirements.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/requirements.txt b/src/requirements.txt index e2dc704d..c9e2bccd 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -14,7 +14,7 @@ qdrant-client==1.10.1 SQLAlchemy==2.0.36 asyncpg==0.30.0 alembic==1.14.0 -psycopg2==2.9.10 +psycopg2-binary==2.9.10 pgvector==0.4.0 nltk==3.9.1 @@ -22,3 +22,10 @@ nltk==3.9.1 prometheus-client==0.21.1 starlette-exporter==0.23.0 fastapi-health==0.4.0 + +# Task Queue and Background Processing +celery==5.5.3 +redis==6.2.0 +kombu==5.5.4 +billiard==4.2.1 +vine==5.1.0 From c631fad5593823b4bd2130f5058dfc2090bdd36b Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sat, 2 Aug 2025 14:19:09 +0300 Subject: [PATCH 45/65] setup basic celery app --- src/.env.example | 88 +++++++++++++++++++++++-------------------- src/celery_app.py | 42 +++++++++++++++++++++ src/helpers/config.py | 8 ++++ src/requirements.txt | 2 +- 4 files changed, 99 insertions(+), 41 deletions(-) create mode 100644 src/celery_app.py diff --git a/src/.env.example b/src/.env.example index 9516fae2..af2d102d 100644 --- a/src/.env.example +++ b/src/.env.example @@ -1,41 +1,49 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" -OPENAI_API_KEY="sk-" - = -FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - = -POSTGRES_USERNAME="postgres" -POSTGRES_PASSWORD="minirag2222" -POSTGRES_HOST="localhost" -POSTGRES_PORT=5432 -POSTGRES_MAIN_DATABASE="minirag" - = -# ========================= LLM Config ========================= -GENERATION_BACKEND = "OPENAI" -EMBEDDING_BACKEND = "COHERE" - = -OPENAI_API_KEY="sk-" -OPENAI_API_URL= -COHERE_API_KEY="m8-" - = -GENERATION_MODEL_ID_LITERAL = 
["gpt-4o-mini", "gpt-4o"] -GENERATION_MODEL_ID="gpt-4o-mini" -EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" -EMBEDDING_MODEL_SIZE=384 - = -INPUT_DAFAULT_MAX_CHARACTERS=1024 -GENERATION_DAFAULT_MAX_TOKENS=200 -GENERATION_DAFAULT_TEMPERATURE=0.1 - = -# ========================= Vector DB Config ========================= -VECTOR_DB_BACKEND_LITERAL = ["QDRANT", "PGVECTOR"] -VECTOR_DB_BACKEND = "PGVECTOR" -VECTOR_DB_PATH = "qdrant_db" -VECTOR_DB_DISTANCE_METHOD = "cosine" +APP_NAME="mini-RAG" +APP_VERSION="0.1" +OPENAI_API_KEY="sk-" + +FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB + +POSTGRES_USERNAME="postgres" +POSTGRES_PASSWORD="minirag2222" +POSTGRES_HOST="localhost" +POSTGRES_PORT=5432 +POSTGRES_MAIN_DATABASE="minirag" + +# ========================= LLM Config ========================= +GENERATION_BACKEND = "OPENAI" +EMBEDDING_BACKEND = "COHERE" + +OPENAI_API_KEY="sk-" +OPENAI_API_URL= +COHERE_API_KEY="m8-" + +GENERATION_MODEL_ID_LITERAL = ["gpt-4o-mini", "gpt-4o"] +GENERATION_MODEL_ID="gpt-4o-mini" +EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" +EMBEDDING_MODEL_SIZE=384 + +INPUT_DAFAULT_MAX_CHARACTERS=1024 +GENERATION_DAFAULT_MAX_TOKENS=200 +GENERATION_DAFAULT_TEMPERATURE=0.1 + +# ========================= Vector DB Config ========================= +VECTOR_DB_BACKEND_LITERAL = ["QDRANT", "PGVECTOR"] +VECTOR_DB_BACKEND = "PGVECTOR" +VECTOR_DB_PATH = "qdrant_db" +VECTOR_DB_DISTANCE_METHOD = "cosine" VECTOR_DB_PGVEC_INDEX_THRESHOLD = - = -# ========================= Template Configs ========================= -PRIMARY_LANG = "ar" -DEFAULT_LANG = "en" + +# ========================= Template Configs ========================= +PRIMARY_LANG = "ar" +DEFAULT_LANG = "en" + +# ========================= Celery Task Queue Config ========================= +CELERY_BROKER_URL="amqp://minirag_user:minirag_rabbitmq_2222@localhost:5672/minirag_vhost" 
+CELERY_RESULT_BACKEND="redis://:minirag_redis_2222@localhost:6379/0" +CELERY_TASK_SERIALIZER="json" +CELERY_TASK_TIME_LIMIT=600 +CELERY_TASK_ACKS_LATE=false +CELERY_WORKER_CONCURRENCY=2 diff --git a/src/celery_app.py b/src/celery_app.py new file mode 100644 index 00000000..63367282 --- /dev/null +++ b/src/celery_app.py @@ -0,0 +1,42 @@ +from celery import Celery +from helpers.config import get_settings + +settings = get_settings() + +# Create Celery application instance +celery_app = Celery( + "minirag", + broker=settings.CELERY_BROKER_URL, + backend=settings.CELERY_RESULT_BACKEND, +) + +# Configure Celery with essential settings +celery_app.conf.update( + task_serializer=settings.CELERY_TASK_SERIALIZER, + result_serializer=settings.CELERY_TASK_SERIALIZER, + accept_content=[ + settings.CELERY_TASK_SERIALIZER + ], + + # Task safety - Late acknowledgment prevents task loss on worker crash + task_acks_late=settings.CELERY_TASK_ACKS_LATE, + + # Time limits - Prevent hanging tasks + task_time_limit=settings.CELERY_TASK_TIME_LIMIT, + + # Result backend - Store results for status tracking + task_ignore_resul=False, + result_expires=3600, + + # Worker settings + worker_concurrency=settings.CELERY_WORKER_CONCURRENCY, + + # Connection settings for better reliability + broker_connection_retry_on_startup=True, + broker_connection_retry=True, + broker_connection_max_retries=10, + worker_cancel_long_running_tasks_on_connection_loss=True, + +) + +celery_app.conf.task_default_queue = "default" \ No newline at end of file diff --git a/src/helpers/config.py b/src/helpers/config.py index 5870fe9e..3f4eac1b 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -41,6 +41,14 @@ class Settings(BaseSettings): PRIMARY_LANG: str = "en" DEFAULT_LANG: str = "en" + # Celery Configuration + CELERY_BROKER_URL: str = None + CELERY_RESULT_BACKEND: str = None + CELERY_TASK_SERIALIZER: str = "json" + CELERY_TASK_TIME_LIMIT: int = 600 + CELERY_TASK_ACKS_LATE: bool = True + 
CELERY_WORKER_CONCURRENCY: int = 2 + class Config: env_file = ".env" diff --git a/src/requirements.txt b/src/requirements.txt index c9e2bccd..3b30cc93 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -14,7 +14,7 @@ qdrant-client==1.10.1 SQLAlchemy==2.0.36 asyncpg==0.30.0 alembic==1.14.0 -psycopg2-binary==2.9.10 +psycopg2==2.9.10 pgvector==0.4.0 nltk==3.9.1 From fce71afb710a8d91688ed6e57192a80ee5c456a7 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sat, 2 Aug 2025 21:27:01 +0300 Subject: [PATCH 46/65] demo celery app for sending reports --- src/celery_app.py | 6 ++++++ src/routes/base.py | 19 +++++++++++++++++++ src/tasks/__init__.py | 0 src/tasks/mail_service.py | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+) create mode 100644 src/tasks/__init__.py create mode 100644 src/tasks/mail_service.py diff --git a/src/celery_app.py b/src/celery_app.py index 63367282..cac539fd 100644 --- a/src/celery_app.py +++ b/src/celery_app.py @@ -1,6 +1,12 @@ from celery import Celery from helpers.config import get_settings +from stores.llm.LLMProviderFactory import LLMProviderFactory +from stores.vectordb.VectorDBProviderFactory import VectorDBProviderFactory +from stores.llm.templates.template_parser import TemplateParser +from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession +from sqlalchemy.orm import sessionmaker + settings = get_settings() # Create Celery application instance diff --git a/src/routes/base.py b/src/routes/base.py index 773797ef..475fc256 100644 --- a/src/routes/base.py +++ b/src/routes/base.py @@ -1,6 +1,11 @@ from fastapi import FastAPI, APIRouter, Depends import os from helpers.config import get_settings, Settings +from time import sleep +import logging +from tasks.mail_service import send_email_reports + +logger = logging.getLogger('uvicorn.error') base_router = APIRouter( prefix="/api/v1", @@ -17,3 +22,17 @@ async def welcome(app_settings: Settings = Depends(get_settings)): "app_name": app_name, 
"app_version": app_version, } + +@base_router.get("/send_reports") +async def send_reports(app_settings: Settings = Depends(get_settings)): + + # ==== START ==== send reports + task = send_email_reports.delay( + mail_wait_seconds=3 + ) + # ==== END ==== send reports + + return { + "success": True, + "task_id": task.id + } \ No newline at end of file diff --git a/src/tasks/__init__.py b/src/tasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/tasks/mail_service.py b/src/tasks/mail_service.py new file mode 100644 index 00000000..ca22429f --- /dev/null +++ b/src/tasks/mail_service.py @@ -0,0 +1,36 @@ +from celery_app import celery_app +from helpers.config import get_settings +from time import sleep +import logging +from datetime import datetime +import asyncio + +logger = logging.getLogger('celery.task') + +@celery_app.task(bind=True, name="tasks.mail_service.send_email_reports") +def send_email_reports(self, mail_wait_seconds: int): + + return asyncio.run(_send_email_reports(self, mail_wait_seconds)) + + +async def _send_email_reports(task_instance, mail_wait_seconds: int): + + started_at = str(datetime.now()) + + task_instance.update_state( + state="PROGRESS", + meta={ + "started_at": started_at + } + ) + + # ==== START ==== send reports + for ix in range(15): + logger.info(f"Send email to user: {ix}") + await asyncio.sleep(mail_wait_seconds) + # ==== END ==== send reports + + return { + "no_emails": 15, + "end_at": str(datetime.now()) + } \ No newline at end of file From a99e107b7239e8c12c4d3702b0456a2b78f952f5 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sat, 2 Aug 2025 21:28:29 +0300 Subject: [PATCH 47/65] set include parameter for send_reports email service --- src/celery_app.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/celery_app.py b/src/celery_app.py index cac539fd..9cc341d3 100644 --- a/src/celery_app.py +++ b/src/celery_app.py @@ -14,6 +14,9 @@ "minirag", broker=settings.CELERY_BROKER_URL, 
backend=settings.CELERY_RESULT_BACKEND, + include=[ + "tasks.mail_service" + ] ) # Configure Celery with essential settings From f3e574192b7e2abd0257f00e96d9fd5d39608971 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sat, 2 Aug 2025 21:29:35 +0300 Subject: [PATCH 48/65] remove mail_service celery demo task --- src/routes/base.py | 14 -------------- src/tasks/mail_service.py | 36 ------------------------------------ 2 files changed, 50 deletions(-) delete mode 100644 src/tasks/mail_service.py diff --git a/src/routes/base.py b/src/routes/base.py index 475fc256..b4972d60 100644 --- a/src/routes/base.py +++ b/src/routes/base.py @@ -22,17 +22,3 @@ async def welcome(app_settings: Settings = Depends(get_settings)): "app_name": app_name, "app_version": app_version, } - -@base_router.get("/send_reports") -async def send_reports(app_settings: Settings = Depends(get_settings)): - - # ==== START ==== send reports - task = send_email_reports.delay( - mail_wait_seconds=3 - ) - # ==== END ==== send reports - - return { - "success": True, - "task_id": task.id - } \ No newline at end of file diff --git a/src/tasks/mail_service.py b/src/tasks/mail_service.py deleted file mode 100644 index ca22429f..00000000 --- a/src/tasks/mail_service.py +++ /dev/null @@ -1,36 +0,0 @@ -from celery_app import celery_app -from helpers.config import get_settings -from time import sleep -import logging -from datetime import datetime -import asyncio - -logger = logging.getLogger('celery.task') - -@celery_app.task(bind=True, name="tasks.mail_service.send_email_reports") -def send_email_reports(self, mail_wait_seconds: int): - - return asyncio.run(_send_email_reports(self, mail_wait_seconds)) - - -async def _send_email_reports(task_instance, mail_wait_seconds: int): - - started_at = str(datetime.now()) - - task_instance.update_state( - state="PROGRESS", - meta={ - "started_at": started_at - } - ) - - # ==== START ==== send reports - for ix in range(15): - logger.info(f"Send email to user: {ix}") - 
await asyncio.sleep(mail_wait_seconds) - # ==== END ==== send reports - - return { - "no_emails": 15, - "end_at": str(datetime.now()) - } \ No newline at end of file From 1623ddf8de5e5b4a4de784c33bd89be50623a917 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 3 Aug 2025 19:41:56 +0300 Subject: [PATCH 49/65] push file processing celery task --- src/celery_app.py | 42 +++++++- src/routes/base.py | 1 - src/routes/data.py | 23 +++++ src/tasks/file_processing.py | 188 +++++++++++++++++++++++++++++++++++ 4 files changed, 252 insertions(+), 2 deletions(-) create mode 100644 src/tasks/file_processing.py diff --git a/src/celery_app.py b/src/celery_app.py index 9cc341d3..64f4f004 100644 --- a/src/celery_app.py +++ b/src/celery_app.py @@ -9,13 +9,49 @@ settings = get_settings() +async def get_setup_utils(): + settings = get_settings() + + postgres_conn = f"postgresql+asyncpg://{settings.POSTGRES_USERNAME}:{settings.POSTGRES_PASSWORD}@{settings.POSTGRES_HOST}:{settings.POSTGRES_PORT}/{settings.POSTGRES_MAIN_DATABASE}" + + db_engine = create_async_engine(postgres_conn) + db_client = sessionmaker( + db_engine, class_=AsyncSession, expire_on_commit=False + ) + + llm_provider_factory = LLMProviderFactory(settings) + vectordb_provider_factory = VectorDBProviderFactory(config=settings, db_client=db_client) + + # generation client + generation_client = llm_provider_factory.create(provider=settings.GENERATION_BACKEND) + generation_client.set_generation_model(model_id = settings.GENERATION_MODEL_ID) + + # embedding client + embedding_client = llm_provider_factory.create(provider=settings.EMBEDDING_BACKEND) + embedding_client.set_embedding_model(model_id=settings.EMBEDDING_MODEL_ID, + embedding_size=settings.EMBEDDING_MODEL_SIZE) + + # vector db client + vectordb_client = vectordb_provider_factory.create( + provider=settings.VECTOR_DB_BACKEND + ) + await vectordb_client.connect() + + template_parser = TemplateParser( + language=settings.PRIMARY_LANG, + 
default_language=settings.DEFAULT_LANG, + ) + + return (db_engine, db_client, llm_provider_factory, vectordb_provider_factory, + generation_client, embedding_client, vectordb_client, template_parser) + # Create Celery application instance celery_app = Celery( "minirag", broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND, include=[ - "tasks.mail_service" + "tasks.file_processing" ] ) @@ -46,6 +82,10 @@ broker_connection_max_retries=10, worker_cancel_long_running_tasks_on_connection_loss=True, + task_routes={ + "tasks.file_processing.process_project_files": {"queue": "file_processing"} + } + ) celery_app.conf.task_default_queue = "default" \ No newline at end of file diff --git a/src/routes/base.py b/src/routes/base.py index b4972d60..45d009e9 100644 --- a/src/routes/base.py +++ b/src/routes/base.py @@ -3,7 +3,6 @@ from helpers.config import get_settings, Settings from time import sleep import logging -from tasks.mail_service import send_email_reports logger = logging.getLogger('uvicorn.error') diff --git a/src/routes/data.py b/src/routes/data.py index b9f5de7d..e849cc3e 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -13,6 +13,7 @@ from models.db_schemes import DataChunk, Asset from models.enums.AssetTypeEnum import AssetTypeEnum from controllers import NLPController +from tasks.file_processing import process_project_files logger = logging.getLogger('uvicorn.error') @@ -96,6 +97,28 @@ async def process_endpoint(request: Request, project_id: int, process_request: P overlap_size = process_request.overlap_size do_reset = process_request.do_reset + task = process_project_files.delay( + project_id=project_id, + file_id=process_request.file_id, + chunk_size=chunk_size, + overlap_size=overlap_size, + do_reset=do_reset, + ) + + return JSONResponse( + content={ + "signal": ResponseSignal.PROCESSING_SUCCESS.value, + "task_id": task.id + } + ) + + + + + + + + project_model = await ProjectModel.create_instance( db_client=request.app.db_client 
) diff --git a/src/tasks/file_processing.py b/src/tasks/file_processing.py new file mode 100644 index 00000000..855390c2 --- /dev/null +++ b/src/tasks/file_processing.py @@ -0,0 +1,188 @@ +from celery_app import celery_app, get_setup_utils +from helpers.config import get_settings +import asyncio +from models.ProjectModel import ProjectModel +from models.ChunkModel import ChunkModel +from models.AssetModel import AssetModel +from models.db_schemes import DataChunk +from models import ResponseSignal +from models.enums.AssetTypeEnum import AssetTypeEnum +from controllers import ProcessController +from controllers import NLPController + +import logging +logger = logging.getLogger(__name__) + +@celery_app.task( + bind=True, name="tasks.file_processing.process_project_files", + autoretry_for=(Exception,), + retry_kwargs={'max_retries': 3, 'countdown': 60} + ) +def process_project_files(self, project_id: int, + file_id: int, chunk_size: int, + overlap_size: int, do_reset: int): + + asyncio.run( + _process_project_files(self, project_id, file_id, chunk_size, + overlap_size, do_reset) + ) + + +async def _process_project_files(task_instance, project_id: int, + file_id: int, chunk_size: int, + overlap_size: int, do_reset: int): + + + db_engine, vectordb_client = None, None + + try: + + (db_engine, db_client, llm_provider_factory, + vectordb_provider_factory, + generation_client, embedding_client, + vectordb_client, template_parser) = await get_setup_utils() + + project_model = await ProjectModel.create_instance( + db_client=db_client + ) + + project = await project_model.get_project_or_create_one( + project_id=project_id + ) + + nlp_controller = NLPController( + vectordb_client=vectordb_client, + generation_client=generation_client, + embedding_client=embedding_client, + template_parser=template_parser, + ) + + asset_model = await AssetModel.create_instance( + db_client=db_client + ) + + project_files_ids = {} + if file_id: + asset_record = await asset_model.get_asset_record( 
+ asset_project_id=project.project_id, + asset_name=file_id + ) + + if asset_record is None: + task_instance.update_state( + state="FAILURE", + meta={ + "signal": ResponseSignal.FILE_ID_ERROR.value, + } + ) + + raise Exception(f"No assets for file: {file_id}") + + project_files_ids = { + asset_record.asset_id: asset_record.asset_name + } + + else: + + + project_files = await asset_model.get_all_project_assets( + asset_project_id=project.project_id, + asset_type=AssetTypeEnum.FILE.value, + ) + + project_files_ids = { + record.asset_id: record.asset_name + for record in project_files + } + + if len(project_files_ids) == 0: + + task_instance.update_state( + state="FAILURE", + meta={ + "signal": ResponseSignal.NO_FILES_ERROR.value, + } + ) + + raise Exception(f"No files found for project_id: {project.project_id}") + + process_controller = ProcessController(project_id=project_id) + + no_records = 0 + no_files = 0 + + chunk_model = await ChunkModel.create_instance( + db_client=db_client + ) + + if do_reset == 1: + # delete associated vectors collection + collection_name = nlp_controller.create_collection_name(project_id=project.project_id) + _ = await vectordb_client.delete_collection(collection_name=collection_name) + + # delete associated chunks + _ = await chunk_model.delete_chunks_by_project_id( + project_id=project.project_id + ) + + for asset_id, file_id in project_files_ids.items(): + + file_content = process_controller.get_file_content(file_id=file_id) + + if file_content is None: + logger.error(f"Error while processing file: {file_id}") + continue + + file_chunks = process_controller.process_file_content( + file_content=file_content, + file_id=file_id, + chunk_size=chunk_size, + overlap_size=overlap_size + ) + + if file_chunks is None or len(file_chunks) == 0: + + logger.error(f"No chunks for file_id: {file_id}") + pass + + file_chunks_records = [ + DataChunk( + chunk_text=chunk.page_content, + chunk_metadata=chunk.metadata, + chunk_order=i+1, + 
chunk_project_id=project.project_id, + chunk_asset_id=asset_id + ) + for i, chunk in enumerate(file_chunks) + ] + + no_records += await chunk_model.insert_many_chunks(chunks=file_chunks_records) + no_files += 1 + + task_instance.update_state( + state="SUCCESS", + meta={ + "signal": ResponseSignal.NO_FILES_ERROR.value, + } + ) + + logger.warning(f"inserted_chunks: {no_records}") + + return { + "signal": ResponseSignal.PROCESSING_SUCCESS.value, + "inserted_chunks": no_records, + "processed_files": no_files + } + + except Exception as e: + logger.error(f"Task failed: {str(e)}") + raise + finally: + try: + if db_engine: + await db_engine.dispose() + + if vectordb_client: + await vectordb_client.disconnect() + except Exception as e: + logger.error(f"Task failed while cleaning: {str(e)}") \ No newline at end of file From 6a1d9e870118d931d850dfa2606059e63c5ffba9 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 3 Aug 2025 20:18:12 +0300 Subject: [PATCH 50/65] setup flower dashbaord --- README.md | 21 +++++++++++++++++++++ src/.env.example | 1 + src/flowerconfig.py | 12 ++++++++++++ src/helpers/config.py | 1 + src/requirements.txt | 1 + src/tasks/file_processing.py | 4 ++-- 6 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 src/flowerconfig.py diff --git a/README.md b/README.md index 69e741fc..9bf29c96 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,27 @@ $ sudo docker compose up -d $ uvicorn main:app --reload --host 0.0.0.0 --port 5000 ``` +# Celery + +To Run the Celery worker, you need to run the following command in a separate terminal: + +```bash +$ python -m celery -A celery_app worker --queues=default,file_processing +--loglevel=info +``` + +To Run **Flower Dashboard**, you can run the following command in a separate terminal: + +```bash +$ python -m celery -A celery_app flower --conf=flowerconfig.py +``` + +```bash +python -m celery -A celery_app flower --conf=flowerconfig.py +``` + +open your browser and go to 
`http://localhost:5555` to see the dashboard. + ## POSTMAN Collection Download the POSTMAN collection from [/assets/mini-rag-app.postman_collection.json](/assets/mini-rag-app.postman_collection.json) diff --git a/src/.env.example b/src/.env.example index af2d102d..40209588 100644 --- a/src/.env.example +++ b/src/.env.example @@ -47,3 +47,4 @@ CELERY_TASK_SERIALIZER="json" CELERY_TASK_TIME_LIMIT=600 CELERY_TASK_ACKS_LATE=false CELERY_WORKER_CONCURRENCY=2 +CELERY_FLOWER_PASSWORD="minirag_flower_2222" diff --git a/src/flowerconfig.py b/src/flowerconfig.py new file mode 100644 index 00000000..5bebec51 --- /dev/null +++ b/src/flowerconfig.py @@ -0,0 +1,12 @@ +from dotenv import dotenv_values +config = dotenv_values(".env") + +# Flower configuration +port = 5555 +max_tasks = 10000 +# db = 'flower.db' # SQLite database for persistent storage +auto_refresh = True + +# Authentication (optional) +basic_auth = [f'admin:{config["CELERY_FLOWER_PASSWORD"]}'] + diff --git a/src/helpers/config.py b/src/helpers/config.py index 3f4eac1b..fb8600b8 100644 --- a/src/helpers/config.py +++ b/src/helpers/config.py @@ -48,6 +48,7 @@ class Settings(BaseSettings): CELERY_TASK_TIME_LIMIT: int = 600 CELERY_TASK_ACKS_LATE: bool = True CELERY_WORKER_CONCURRENCY: int = 2 + CELERY_FLOWER_PASSWORD: str = None class Config: env_file = ".env" diff --git a/src/requirements.txt b/src/requirements.txt index 3b30cc93..cb2b5a90 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -29,3 +29,4 @@ redis==6.2.0 kombu==5.5.4 billiard==4.2.1 vine==5.1.0 +flower==2.0.1 \ No newline at end of file diff --git a/src/tasks/file_processing.py b/src/tasks/file_processing.py index 855390c2..0ac6b132 100644 --- a/src/tasks/file_processing.py +++ b/src/tasks/file_processing.py @@ -22,7 +22,7 @@ def process_project_files(self, project_id: int, file_id: int, chunk_size: int, overlap_size: int, do_reset: int): - asyncio.run( + return asyncio.run( _process_project_files(self, project_id, file_id, chunk_size, 
overlap_size, do_reset) ) @@ -162,7 +162,7 @@ async def _process_project_files(task_instance, project_id: int, task_instance.update_state( state="SUCCESS", meta={ - "signal": ResponseSignal.NO_FILES_ERROR.value, + "signal": ResponseSignal.PROCESSING_SUCCESS.value, } ) From 256af03cf2a1885b3d7821af12685b5912871d9d Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 3 Aug 2025 20:20:09 +0300 Subject: [PATCH 51/65] update celery instructions + remove unused codes --- README.md | 3 +- src/routes/data.py | 130 --------------------------------------------- 2 files changed, 1 insertion(+), 132 deletions(-) diff --git a/README.md b/README.md index 9bf29c96..1dff16d9 100644 --- a/README.md +++ b/README.md @@ -113,8 +113,7 @@ $ uvicorn main:app --reload --host 0.0.0.0 --port 5000 To Run the Celery worker, you need to run the following command in a separate terminal: ```bash -$ python -m celery -A celery_app worker --queues=default,file_processing ---loglevel=info +$ python -m celery -A celery_app worker --queues=default,file_processing --loglevel=info ``` To Run **Flower Dashboard**, you can run the following command in a separate terminal: diff --git a/src/routes/data.py b/src/routes/data.py index e849cc3e..f1ce8b2e 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -111,133 +111,3 @@ async def process_endpoint(request: Request, project_id: int, process_request: P "task_id": task.id } ) - - - - - - - - - project_model = await ProjectModel.create_instance( - db_client=request.app.db_client - ) - - project = await project_model.get_project_or_create_one( - project_id=project_id - ) - - nlp_controller = NLPController( - vectordb_client=request.app.vectordb_client, - generation_client=request.app.generation_client, - embedding_client=request.app.embedding_client, - template_parser=request.app.template_parser, - ) - - asset_model = await AssetModel.create_instance( - db_client=request.app.db_client - ) - - project_files_ids = {} - if process_request.file_id: - 
asset_record = await asset_model.get_asset_record( - asset_project_id=project.project_id, - asset_name=process_request.file_id - ) - - if asset_record is None: - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": ResponseSignal.FILE_ID_ERROR.value, - } - ) - - project_files_ids = { - asset_record.asset_id: asset_record.asset_name - } - - else: - - - project_files = await asset_model.get_all_project_assets( - asset_project_id=project.project_id, - asset_type=AssetTypeEnum.FILE.value, - ) - - project_files_ids = { - record.asset_id: record.asset_name - for record in project_files - } - - if len(project_files_ids) == 0: - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": ResponseSignal.NO_FILES_ERROR.value, - } - ) - - process_controller = ProcessController(project_id=project_id) - - no_records = 0 - no_files = 0 - - chunk_model = await ChunkModel.create_instance( - db_client=request.app.db_client - ) - - if do_reset == 1: - # delete associated vectors collection - collection_name = nlp_controller.create_collection_name(project_id=project.project_id) - _ = await request.app.vectordb_client.delete_collection(collection_name=collection_name) - - # delete associated chunks - _ = await chunk_model.delete_chunks_by_project_id( - project_id=project.project_id - ) - - for asset_id, file_id in project_files_ids.items(): - - file_content = process_controller.get_file_content(file_id=file_id) - - if file_content is None: - logger.error(f"Error while processing file: {file_id}") - continue - - file_chunks = process_controller.process_file_content( - file_content=file_content, - file_id=file_id, - chunk_size=chunk_size, - overlap_size=overlap_size - ) - - if file_chunks is None or len(file_chunks) == 0: - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": ResponseSignal.PROCESSING_FAILED.value - } - ) - - file_chunks_records = [ - DataChunk( - 
chunk_text=chunk.page_content, - chunk_metadata=chunk.metadata, - chunk_order=i+1, - chunk_project_id=project.project_id, - chunk_asset_id=asset_id - ) - for i, chunk in enumerate(file_chunks) - ] - - no_records += await chunk_model.insert_many_chunks(chunks=file_chunks_records) - no_files += 1 - - return JSONResponse( - content={ - "signal": ResponseSignal.PROCESSING_SUCCESS.value, - "inserted_chunks": no_records, - "processed_files": no_files - } - ) From 2891c8d32f350d1f896a2201eb996cf7e08f5860 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 3 Aug 2025 20:47:08 +0300 Subject: [PATCH 52/65] push data/index push to celery --- src/celery_app.py | 6 +- src/models/enums/ResponseEnums.py | 1 + src/routes/nlp.py | 81 ++--------------- src/tasks/data_indexing.py | 143 ++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 76 deletions(-) create mode 100644 src/tasks/data_indexing.py diff --git a/src/celery_app.py b/src/celery_app.py index 64f4f004..82a51417 100644 --- a/src/celery_app.py +++ b/src/celery_app.py @@ -51,7 +51,8 @@ async def get_setup_utils(): broker=settings.CELERY_BROKER_URL, backend=settings.CELERY_RESULT_BACKEND, include=[ - "tasks.file_processing" + "tasks.file_processing", + "tasks.data_indexing" ] ) @@ -83,7 +84,8 @@ async def get_setup_utils(): worker_cancel_long_running_tasks_on_connection_loss=True, task_routes={ - "tasks.file_processing.process_project_files": {"queue": "file_processing"} + "tasks.file_processing.process_project_files": {"queue": "file_processing"}, + "tasks.data_indexing.index_data_content": {"queue": "data_indexing"}, } ) diff --git a/src/models/enums/ResponseEnums.py b/src/models/enums/ResponseEnums.py index 5869fcad..2c45c32c 100644 --- a/src/models/enums/ResponseEnums.py +++ b/src/models/enums/ResponseEnums.py @@ -19,4 +19,5 @@ class ResponseSignal(Enum): VECTORDB_SEARCH_SUCCESS = "vectordb_search_success" RAG_ANSWER_ERROR = "rag_answer_error" RAG_ANSWER_SUCCESS = "rag_answer_success" + 
DATA_PUSH_TASK_READY="data_push_task_ready" \ No newline at end of file diff --git a/src/routes/nlp.py b/src/routes/nlp.py index 2cfa159f..ccd273e7 100644 --- a/src/routes/nlp.py +++ b/src/routes/nlp.py @@ -6,6 +6,7 @@ from controllers import NLPController from models import ResponseSignal from tqdm.auto import tqdm +from tasks.data_indexing import index_data_content import logging @@ -19,86 +20,18 @@ @nlp_router.post("/index/push/{project_id}") async def index_project(request: Request, project_id: int, push_request: PushRequest): - project_model = await ProjectModel.create_instance( - db_client=request.app.db_client - ) - - chunk_model = await ChunkModel.create_instance( - db_client=request.app.db_client - ) - - project = await project_model.get_project_or_create_one( - project_id=project_id + task = index_data_content.delay( + project_id=project_id, + do_reset=push_request.do_reset ) - if not project: - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": ResponseSignal.PROJECT_NOT_FOUND_ERROR.value - } - ) - - nlp_controller = NLPController( - vectordb_client=request.app.vectordb_client, - generation_client=request.app.generation_client, - embedding_client=request.app.embedding_client, - template_parser=request.app.template_parser, - ) - - has_records = True - page_no = 1 - inserted_items_count = 0 - idx = 0 - - # create collection if not exists - collection_name = nlp_controller.create_collection_name(project_id=project.project_id) - - _ = await request.app.vectordb_client.create_collection( - collection_name=collection_name, - embedding_size=request.app.embedding_client.embedding_size, - do_reset=push_request.do_reset, - ) - - # setup batching - total_chunks_count = await chunk_model.get_total_chunks_count(project_id=project.project_id) - pbar = tqdm(total=total_chunks_count, desc="Vector Indexing", position=0) - - while has_records: - page_chunks = await chunk_model.get_poject_chunks(project_id=project.project_id, 
page_no=page_no) - if len(page_chunks): - page_no += 1 - - if not page_chunks or len(page_chunks) == 0: - has_records = False - break - - chunks_ids = [ c.chunk_id for c in page_chunks ] - idx += len(page_chunks) - - is_inserted = await nlp_controller.index_into_vector_db( - project=project, - chunks=page_chunks, - chunks_ids=chunks_ids - ) - - if not is_inserted: - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": ResponseSignal.INSERT_INTO_VECTORDB_ERROR.value - } - ) - - pbar.update(len(page_chunks)) - inserted_items_count += len(page_chunks) - return JSONResponse( content={ - "signal": ResponseSignal.INSERT_INTO_VECTORDB_SUCCESS.value, - "inserted_items_count": inserted_items_count + "signal": ResponseSignal.DATA_PUSH_TASK_READY.value, + "task_id": task.id } ) + @nlp_router.get("/index/info/{project_id}") async def get_project_index_info(request: Request, project_id: int): diff --git a/src/tasks/data_indexing.py b/src/tasks/data_indexing.py new file mode 100644 index 00000000..b3a66be4 --- /dev/null +++ b/src/tasks/data_indexing.py @@ -0,0 +1,143 @@ +from celery_app import celery_app, get_setup_utils +from helpers.config import get_settings +import asyncio +from fastapi.responses import JSONResponse +from models.ProjectModel import ProjectModel +from models.ChunkModel import ChunkModel +from controllers import NLPController +from models import ResponseSignal +from tqdm.auto import tqdm + +import logging +logger = logging.getLogger(__name__) + +@celery_app.task( + bind=True, name="tasks.data_indexing.index_data_content", + autoretry_for=(Exception,), + retry_kwargs={'max_retries': 3, 'countdown': 60} + ) +def index_data_content(self, project_id: int, do_reset: int): + + return asyncio.run( + _index_data_content(self, project_id, do_reset) + ) + +async def _index_data_content(task_instance, project_id: int, do_reset: int): + + db_engine, vectordb_client = None, None + + try: + + (db_engine, db_client, llm_provider_factory, + 
vectordb_provider_factory, + generation_client, embedding_client, + vectordb_client, template_parser) = await get_setup_utils() + + logger.warning("Setup utils were loaded!") + + project_model = await ProjectModel.create_instance( + db_client=db_client + ) + + chunk_model = await ChunkModel.create_instance( + db_client=db_client + ) + + project = await project_model.get_project_or_create_one( + project_id=project_id + ) + + if not project: + + task_instance.update_state( + state="FAILURE", + meta={ + "signal": ResponseSignal.PROJECT_NOT_FOUND_ERROR.value + } + ) + + raise Exception(f"No project found for project_id: {project_id}") + + nlp_controller = NLPController( + vectordb_client=vectordb_client, + generation_client=generation_client, + embedding_client=embedding_client, + template_parser=template_parser, + ) + + has_records = True + page_no = 1 + inserted_items_count = 0 + idx = 0 + + # create collection if not exists + collection_name = nlp_controller.create_collection_name(project_id=project.project_id) + + _ = await vectordb_client.create_collection( + collection_name=collection_name, + embedding_size=embedding_client.embedding_size, + do_reset=do_reset, + ) + + # setup batching + total_chunks_count = await chunk_model.get_total_chunks_count(project_id=project.project_id) + pbar = tqdm(total=total_chunks_count, desc="Vector Indexing", position=0) + + while has_records: + page_chunks = await chunk_model.get_poject_chunks(project_id=project.project_id, page_no=page_no) + if len(page_chunks): + page_no += 1 + + if not page_chunks or len(page_chunks) == 0: + has_records = False + break + + chunks_ids = [ c.chunk_id for c in page_chunks ] + idx += len(page_chunks) + + is_inserted = await nlp_controller.index_into_vector_db( + project=project, + chunks=page_chunks, + chunks_ids=chunks_ids + ) + + if not is_inserted: + + + task_instance.update_state( + state="FAILURE", + meta={ + "signal": ResponseSignal.INSERT_INTO_VECTORDB_ERROR.value + } + ) + + raise 
Exception(f"can not insert into vectorDB | project_id: {project_id}") + + pbar.update(len(page_chunks)) + inserted_items_count += len(page_chunks) + + + task_instance.update_state( + state="SUCCESS", + meta={ + "signal": ResponseSignal.INSERT_INTO_VECTORDB_SUCCESS.value, + } + ) + + return { + "signal": ResponseSignal.INSERT_INTO_VECTORDB_SUCCESS.value, + "inserted_items_count": inserted_items_count + } + + except Exception as e: + logger.error(f"Task failed: {str(e)}") + raise + finally: + try: + if db_engine: + await db_engine.dispose() + + if vectordb_client: + await vectordb_client.disconnect() + except Exception as e: + logger.error(f"Task failed while cleaning: {str(e)}") \ No newline at end of file From 38386c43130561c75d577779d08608c1e8a90a77 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 3 Aug 2025 22:20:22 +0300 Subject: [PATCH 53/65] setup celery workflow --- src/celery_app.py | 4 ++- src/models/enums/ResponseEnums.py | 1 + src/routes/data.py | 23 +++++++++++++ src/tasks/data_indexing.py | 1 + src/tasks/file_processing.py | 4 ++- src/tasks/process_workflow.py | 54 +++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 src/tasks/process_workflow.py diff --git a/src/celery_app.py b/src/celery_app.py index 82a51417..bb438f62 100644 --- a/src/celery_app.py +++ b/src/celery_app.py @@ -52,7 +52,8 @@ async def get_setup_utils(): backend=settings.CELERY_RESULT_BACKEND, include=[ "tasks.file_processing", - "tasks.data_indexing" + "tasks.data_indexing", + "tasks.process_workflow" ] ) @@ -86,6 +87,7 @@ async def get_setup_utils(): task_routes={ "tasks.file_processing.process_project_files": {"queue": "file_processing"}, "tasks.data_indexing.index_data_content": {"queue": "data_indexing"}, + "tasks.process_workflow.process_and_push_workflow": {"queue": "file_processing"}, } ) diff --git a/src/models/enums/ResponseEnums.py b/src/models/enums/ResponseEnums.py index 2c45c32c..aa46a092 100644 --- 
a/src/models/enums/ResponseEnums.py +++ b/src/models/enums/ResponseEnums.py @@ -20,4 +20,5 @@ class ResponseSignal(Enum): RAG_ANSWER_ERROR = "rag_answer_error" RAG_ANSWER_SUCCESS = "rag_answer_success" DATA_PUSH_TASK_READY="data_push_task_ready" + PROCESS_AND_PUSH_WORKFLOW_READY="process_and_push_workflow_ready" \ No newline at end of file diff --git a/src/routes/data.py b/src/routes/data.py index f1ce8b2e..b419c688 100644 --- a/src/routes/data.py +++ b/src/routes/data.py @@ -14,6 +14,7 @@ from models.enums.AssetTypeEnum import AssetTypeEnum from controllers import NLPController from tasks.file_processing import process_project_files +from tasks.process_workflow import process_and_push_workflow logger = logging.getLogger('uvicorn.error') @@ -111,3 +112,25 @@ async def process_endpoint(request: Request, project_id: int, process_request: P "task_id": task.id } ) + +@data_router.post("/process-and-push/{project_id}") +async def process_and_push_endpoint(request: Request, project_id: int, process_request: ProcessRequest): + + chunk_size = process_request.chunk_size + overlap_size = process_request.overlap_size + do_reset = process_request.do_reset + + workflow_task = process_and_push_workflow.delay( + project_id=project_id, + file_id=process_request.file_id, + chunk_size=chunk_size, + overlap_size=overlap_size, + do_reset=do_reset, + ) + + return JSONResponse( + content={ + "signal": ResponseSignal.PROCESS_AND_PUSH_WORKFLOW_READY.value, + "workflow_task_id": workflow_task.id + } + ) diff --git a/src/tasks/data_indexing.py b/src/tasks/data_indexing.py index b3a66be4..a93b5f36 100644 --- a/src/tasks/data_indexing.py +++ b/src/tasks/data_indexing.py @@ -18,6 +18,7 @@ ) def index_data_content(self, project_id: int, do_reset: int): + logger.warning("index_data_content started") return asyncio.run( _index_data_content(self, project_id, do_reset) ) diff --git a/src/tasks/file_processing.py b/src/tasks/file_processing.py index 0ac6b132..54aef802 100644 --- 
a/src/tasks/file_processing.py +++ b/src/tasks/file_processing.py @@ -171,7 +171,9 @@ async def _process_project_files(task_instance, project_id: int, return { "signal": ResponseSignal.PROCESSING_SUCCESS.value, "inserted_chunks": no_records, - "processed_files": no_files + "processed_files": no_files, + "project_id": project_id, + "do_reset": do_reset } except Exception as e: diff --git a/src/tasks/process_workflow.py b/src/tasks/process_workflow.py new file mode 100644 index 00000000..aa77b61f --- /dev/null +++ b/src/tasks/process_workflow.py @@ -0,0 +1,54 @@ +from celery import chain +from celery_app import celery_app, get_setup_utils +from helpers.config import get_settings +import asyncio +from tasks.file_processing import process_project_files +from tasks.data_indexing import _index_data_content + +import logging +logger = logging.getLogger(__name__) + +@celery_app.task( + bind=True, name="tasks.process_workflow.push_after_process_task", + autoretry_for=(Exception,), + retry_kwargs={'max_retries': 3, 'countdown': 60} + ) +def push_after_process_task(self, prev_task_result): + + project_id = prev_task_result.get("project_id") + do_reset = prev_task_result.get("do_reset") + + task_results = asyncio.run( + _index_data_content(self, project_id, do_reset) + ) + + return { + "project_id": project_id, + "do_reset": do_reset, + "task_results": task_results + } + + +@celery_app.task( + bind=True, name="tasks.process_workflow.process_and_push_workflow", + autoretry_for=(Exception,), + retry_kwargs={'max_retries': 3, 'countdown': 60} + ) +def process_and_push_workflow( self, project_id: int, + file_id: int, chunk_size: int, + overlap_size: int, do_reset: int): + + workflow = chain( + process_project_files.s(project_id, file_id, chunk_size, overlap_size, do_reset), + push_after_process_task.s() + ) + + result = workflow.apply_async() + + return { + "signal": "WORKFLOW_STARTED", + "workflow_id": result.id, + "tasks": ["tasks.file_processing.process_project_files", + 
"tasks.data_indexing.index_data_content"] + } + From 49efd69c2188b44674b13df548a8dd80608ad725 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 3 Aug 2025 22:20:34 +0300 Subject: [PATCH 54/65] setup celery_task_executions database table --- ...09b_create_celery_task_executions_table.py | 51 +++++++++++++++++++ .../db_schemes/minirag/schemes/__init__.py | 1 + .../minirag/schemes/celery_task_execution.py | 34 +++++++++++++ 3 files changed, 86 insertions(+) create mode 100644 src/models/db_schemes/minirag/alembic/versions/b9f9e870b09b_create_celery_task_executions_table.py create mode 100644 src/models/db_schemes/minirag/schemes/celery_task_execution.py diff --git a/src/models/db_schemes/minirag/alembic/versions/b9f9e870b09b_create_celery_task_executions_table.py b/src/models/db_schemes/minirag/alembic/versions/b9f9e870b09b_create_celery_task_executions_table.py new file mode 100644 index 00000000..ad406e13 --- /dev/null +++ b/src/models/db_schemes/minirag/alembic/versions/b9f9e870b09b_create_celery_task_executions_table.py @@ -0,0 +1,51 @@ +"""create celery_task_executions table + +Revision ID: b9f9e870b09b +Revises: fee4cd54bd38 +Create Date: 2025-08-03 22:17:07.184977 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = 'b9f9e870b09b' +down_revision: Union[str, None] = 'fee4cd54bd38' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('celery_task_executions', + sa.Column('execution_id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('task_name', sa.String(length=255), nullable=False), + sa.Column('task_args_hash', sa.String(length=64), nullable=False), + sa.Column('celery_task_id', sa.UUID(), nullable=True), + sa.Column('status', sa.String(length=20), nullable=False), + sa.Column('task_args', postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column('result', postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column('started_at', sa.DateTime(timezone=True), nullable=True), + sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True), + sa.PrimaryKeyConstraint('execution_id') + ) + op.create_index('ixz_celery_task_id', 'celery_task_executions', ['celery_task_id'], unique=False) + op.create_index('ixz_task_execution_created_at', 'celery_task_executions', ['created_at'], unique=False) + op.create_index('ixz_task_execution_status', 'celery_task_executions', ['status'], unique=False) + op.create_index('ixz_task_name_args_hash', 'celery_task_executions', ['task_name', 'task_args_hash'], unique=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index('ixz_task_name_args_hash', table_name='celery_task_executions') + op.drop_index('ixz_task_execution_status', table_name='celery_task_executions') + op.drop_index('ixz_task_execution_created_at', table_name='celery_task_executions') + op.drop_index('ixz_celery_task_id', table_name='celery_task_executions') + op.drop_table('celery_task_executions') + # ### end Alembic commands ### diff --git a/src/models/db_schemes/minirag/schemes/__init__.py b/src/models/db_schemes/minirag/schemes/__init__.py index 2a6cc0d0..478a864d 100644 --- a/src/models/db_schemes/minirag/schemes/__init__.py +++ b/src/models/db_schemes/minirag/schemes/__init__.py @@ -2,3 +2,4 @@ from .asset import Asset from .project import Project from .datachunk import DataChunk, RetrievedDocument +from .celery_task_execution import CeleryTaskExecution diff --git a/src/models/db_schemes/minirag/schemes/celery_task_execution.py b/src/models/db_schemes/minirag/schemes/celery_task_execution.py new file mode 100644 index 00000000..c33a7d07 --- /dev/null +++ b/src/models/db_schemes/minirag/schemes/celery_task_execution.py @@ -0,0 +1,34 @@ +from .minirag_base import SQLAlchemyBase +from sqlalchemy import Column, Integer, DateTime, func, String, Text +from sqlalchemy.dialects.postgresql import UUID, JSONB +from sqlalchemy import Index +import uuid + +class CeleryTaskExecution(SQLAlchemyBase): + + __tablename__ = "celery_task_executions" + + execution_id = Column(Integer, primary_key=True, autoincrement=True) + + task_name = Column(String(255), nullable=False) + task_args_hash = Column(String(64), nullable=False) # SHA-256 hash of task arguments + celery_task_id = Column(UUID(as_uuid=True), nullable=True) + + status = Column(String(20), nullable=False, default='PENDING') + + task_args = Column(JSONB, nullable=True) + result = Column(JSONB, nullable=True) + + started_at = Column(DateTime(timezone=True), nullable=True) + completed_at = Column(DateTime(timezone=True), nullable=True) + created_at = 
Column(DateTime(timezone=True), server_default=func.now(), nullable=False) + updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) + + __table_args__ = ( + Index('ixz_task_name_args_hash', task_name, task_args_hash, unique=True), + Index('ixz_task_execution_status', status), + Index('ixz_task_execution_created_at', created_at), + Index('ixz_celery_task_id', celery_task_id), + ) + + From 95ec8cd9e52513037e51bb6c542a057f66c3b6b6 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Sun, 3 Aug 2025 23:57:05 +0300 Subject: [PATCH 55/65] setup celery beat ot clean up database celery executions table --- README.md | 8 +- src/.gitignore | 3 + src/celery_app.py | 16 ++- ...b0_update_celery_task_executions_table_.py | 30 +++++ .../minirag/schemes/celery_task_execution.py | 2 +- src/tasks/file_processing.py | 72 ++++++++++ src/tasks/maintenance.py | 50 +++++++ src/utils/idempotency_manager.py | 124 ++++++++++++++++++ 8 files changed, 301 insertions(+), 4 deletions(-) create mode 100644 src/models/db_schemes/minirag/alembic/versions/243ca8b683b0_update_celery_task_executions_table_.py create mode 100644 src/tasks/maintenance.py create mode 100644 src/utils/idempotency_manager.py diff --git a/README.md b/README.md index 1dff16d9..3ebe30d3 100644 --- a/README.md +++ b/README.md @@ -110,12 +110,18 @@ $ uvicorn main:app --reload --host 0.0.0.0 --port 5000 # Celery -To Run the Celery worker, you need to run the following command in a separate terminal: +To Run the **Celery worker**, you need to run the following command in a separate terminal: ```bash $ python -m celery -A celery_app worker --queues=default,file_processing --loglevel=info ``` +To run the **Beat scheduler**, you can run the following command in a separate terminal: + +```bash +$ python -m celery -A celery_app beat --loglevel=info +``` + To Run **Flower Dashboard**, you can run the following command in a separate terminal: ```bash diff --git a/src/.gitignore b/src/.gitignore index 
68bc17f9..3069115f 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -5,6 +5,9 @@ __pycache__/ # C extensions *.so +*.bak +*.dat +*.dir # Distribution / packaging .Python diff --git a/src/celery_app.py b/src/celery_app.py index bb438f62..4831ab8e 100644 --- a/src/celery_app.py +++ b/src/celery_app.py @@ -53,7 +53,8 @@ async def get_setup_utils(): include=[ "tasks.file_processing", "tasks.data_indexing", - "tasks.process_workflow" + "tasks.process_workflow", + "tasks.maintenance", ] ) @@ -88,7 +89,18 @@ async def get_setup_utils(): "tasks.file_processing.process_project_files": {"queue": "file_processing"}, "tasks.data_indexing.index_data_content": {"queue": "data_indexing"}, "tasks.process_workflow.process_and_push_workflow": {"queue": "file_processing"}, - } + "tasks.maintenance.clean_celery_executions_table": {"queue": "default"}, + }, + + beat_schedule={ + 'cleanup-old-task-records': { + 'task': "tasks.maintenance.clean_celery_executions_table", + 'schedule': 10, + 'args': () + } + }, + + timezone='UTC', ) diff --git a/src/models/db_schemes/minirag/alembic/versions/243ca8b683b0_update_celery_task_executions_table_.py b/src/models/db_schemes/minirag/alembic/versions/243ca8b683b0_update_celery_task_executions_table_.py new file mode 100644 index 00000000..bc504d12 --- /dev/null +++ b/src/models/db_schemes/minirag/alembic/versions/243ca8b683b0_update_celery_task_executions_table_.py @@ -0,0 +1,30 @@ +"""update celery_task_executions table indexes + +Revision ID: 243ca8b683b0 +Revises: b9f9e870b09b +Create Date: 2025-08-03 23:15:43.860171 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '243ca8b683b0' +down_revision: Union[str, None] = 'b9f9e870b09b' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_index('ixz_task_name_args_celery_hash', 'celery_task_executions', ['task_name', 'task_args_hash', 'celery_task_id'], unique=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index('ixz_task_name_args_celery_hash', table_name='celery_task_executions') + # ### end Alembic commands ### diff --git a/src/models/db_schemes/minirag/schemes/celery_task_execution.py b/src/models/db_schemes/minirag/schemes/celery_task_execution.py index c33a7d07..8ef63aa6 100644 --- a/src/models/db_schemes/minirag/schemes/celery_task_execution.py +++ b/src/models/db_schemes/minirag/schemes/celery_task_execution.py @@ -25,7 +25,7 @@ class CeleryTaskExecution(SQLAlchemyBase): updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) __table_args__ = ( - Index('ixz_task_name_args_hash', task_name, task_args_hash, unique=True), + Index('ixz_task_name_args_celery_hash', task_name, task_args_hash, celery_task_id, unique=True), Index('ixz_task_execution_status', status), Index('ixz_task_execution_created_at', created_at), Index('ixz_celery_task_id', celery_task_id), diff --git a/src/tasks/file_processing.py b/src/tasks/file_processing.py index 54aef802..361ce98c 100644 --- a/src/tasks/file_processing.py +++ b/src/tasks/file_processing.py @@ -9,6 +9,7 @@ from models.enums.AssetTypeEnum import AssetTypeEnum from controllers import ProcessController from controllers import NLPController +from utils.idempotency_manager import IdempotencyManager import logging logger = logging.getLogger(__name__) @@ -42,6 +43,57 @@ async def _process_project_files(task_instance, project_id: int, generation_client, embedding_client, vectordb_client, template_parser) = await get_setup_utils() + # Create idempotency manager + idempotency_manager = IdempotencyManager(db_client, db_engine) + + # Define task arguments for idempotency check + task_args = { + "project_id": project_id, + "file_id": 
file_id, + "chunk_size": chunk_size, + "overlap_size": overlap_size, + "do_reset": do_reset + } + + task_name = "tasks.file_processing.process_project_files" + + settings = get_settings() + + # Check if task should execute (600 seconds = 10 minutes timeout) + should_execute, existing_task = await idempotency_manager.should_execute_task( + task_name=task_name, + task_args=task_args, + celery_task_id=task_instance.request.id, + task_time_limit=settings.CELERY_TASK_TIME_LIMIT + ) + + if not should_execute: + logger.warning(f"Can not handle th task | status: {existing_task.status}") + return existing_task.result + + task_record = None + if existing_task: + # Update existing task with new celery task ID + await idempotency_manager.update_task_status( + execution_id=existing_task.execution_id, + status='PENDING' + ) + task_record = existing_task + else: + # Create new task record + task_record = await idempotency_manager.create_task_record( + task_name=task_name, + task_args=task_args, + celery_task_id=task_instance.request.id + ) + + # Update status to STARTED + await idempotency_manager.update_task_status( + execution_id=task_record.execution_id, + status='STARTED' + ) + + project_model = await ProjectModel.create_instance( db_client=db_client ) @@ -76,6 +128,13 @@ async def _process_project_files(task_instance, project_id: int, } ) + # Update task status to FAILURE + await idempotency_manager.update_task_status( + execution_id=task_record.execution_id, + status='FAILURE', + result={"signal": ResponseSignal.FILE_ID_ERROR.value} + ) + raise Exception(f"No assets for file: {file_id}") project_files_ids = { @@ -104,6 +163,13 @@ async def _process_project_files(task_instance, project_id: int, } ) + # Update task status to FAILURE + await idempotency_manager.update_task_status( + execution_id=task_record.execution_id, + status='FAILURE', + result={"signal": ResponseSignal.NO_FILES_ERROR.value,} + ) + raise Exception(f"No files found for project_id: {project.project_id}") 
process_controller = ProcessController(project_id=project_id) @@ -166,6 +232,12 @@ async def _process_project_files(task_instance, project_id: int, } ) + await idempotency_manager.update_task_status( + execution_id=task_record.execution_id, + status='SUCCESS', + result={"signal": ResponseSignal.PROCESSING_SUCCESS.value} + ) + logger.warning(f"inserted_chunks: {no_records}") return { diff --git a/src/tasks/maintenance.py b/src/tasks/maintenance.py new file mode 100644 index 00000000..6b2e94ea --- /dev/null +++ b/src/tasks/maintenance.py @@ -0,0 +1,50 @@ +from celery_app import celery_app, get_setup_utils +from helpers.config import get_settings +import asyncio +from utils.idempotency_manager import IdempotencyManager + +import logging +logger = logging.getLogger(__name__) + +@celery_app.task( + bind=True, name="tasks.maintenance.clean_celery_executions_table", + autoretry_for=(Exception,), + retry_kwargs={'max_retries': 3, 'countdown': 60} + ) +def clean_celery_executions_table(self): + + return asyncio.run( + _clean_celery_executions_table(self) + ) + +async def _clean_celery_executions_table(task_instance): + + db_engine, vectordb_client = None, None + + try: + + (db_engine, db_client, llm_provider_factory, + vectordb_provider_factory, + generation_client, embedding_client, + vectordb_client, template_parser) = await get_setup_utils() + + # Create idempotency manager + idempotency_manager = IdempotencyManager(db_client, db_engine) + + logger.warning(f"cleaning !!!") + _ = await idempotency_manager.cleanup_old_tasks(5) + + return True + + except Exception as e: + logger.error(f"Task failed: {str(e)}") + raise + finally: + try: + if db_engine: + await db_engine.dispose() + + if vectordb_client: + await vectordb_client.disconnect() + except Exception as e: + logger.error(f"Task failed while cleaning: {str(e)}") \ No newline at end of file diff --git a/src/utils/idempotency_manager.py b/src/utils/idempotency_manager.py new file mode 100644 index 00000000..127a14de --- 
/dev/null +++ b/src/utils/idempotency_manager.py @@ -0,0 +1,124 @@ +import hashlib +import json +from datetime import datetime, timedelta, timezone +from sqlalchemy import select, delete +from models.db_schemes.minirag.schemes.celery_task_execution import CeleryTaskExecution + +class IdempotencyManager: + + def __init__(self, db_client, db_engine): + self.db_client = db_client + self.db_engine = db_engine + + def create_args_hash(self, task_name: str, task_args: dict): + combined_data = { + **task_args, + "task_name": task_name + } + json_string = json.dumps(combined_data, sort_keys=True, default=str) + return hashlib.sha256(json_string.encode()).hexdigest() + + async def create_task_record(self, task_name: str, task_args: dict, celery_task_id: str = None) -> CeleryTaskExecution: + """Create new task execution record.""" + args_hash = self.create_args_hash(task_name, task_args) + + task_record = CeleryTaskExecution( + task_name=task_name, + task_args_hash=args_hash, + task_args=task_args, + celery_task_id=celery_task_id, + status='PENDING', + started_at=datetime.utcnow() + ) + + session = self.db_client() + try: + session.add(task_record) + await session.commit() + await session.refresh(task_record) + return task_record + finally: + await session.close() + + async def update_task_status(self, execution_id: int, status: str, result: dict = None): + """Update task status and result.""" + session = self.db_client() + try: + task_record = await session.get(CeleryTaskExecution, execution_id) + if task_record: + task_record.status = status + if result: + task_record.result = result + if status in ['SUCCESS', 'FAILURE']: + task_record.completed_at = datetime.utcnow() + await session.commit() + finally: + await session.close() + + async def get_existing_task(self, task_name: str, + task_args: dict, celery_task_id: str) -> CeleryTaskExecution: + """Check if task with same name and args already exists.""" + args_hash = self.create_args_hash(task_name, task_args) + + session 
= self.db_client() + try: + stmt = select(CeleryTaskExecution).where( + CeleryTaskExecution.celery_task_id == celery_task_id, + CeleryTaskExecution.task_name == task_name, + CeleryTaskExecution.task_args_hash == args_hash + ) + result = await session.execute(stmt) + return result.scalar_one_or_none() + finally: + await session.close() + + async def should_execute_task(self, task_name: str, task_args: dict, + celery_task_id: str, + task_time_limit: int = 600) -> tuple[bool, CeleryTaskExecution]: + """ + Check if task should be executed or return existing result. + Args: + task_time_limit: Time limit in seconds after which a stuck task can be re-executed + Returns (should_execute, existing_task_or_none) + """ + existing_task = await self.get_existing_task(task_name, task_args, celery_task_id) + + if not existing_task: + return True, None + + # Don't execute if task is already completed successfully + if existing_task.status == 'SUCCESS': + return False, existing_task + + # Check if task is stuck (running longer than time limit + 60 seconds) + if existing_task.status in ['PENDING', 'STARTED', 'RETRY']: + if existing_task.started_at: + time_elapsed = (datetime.utcnow() - existing_task.started_at).total_seconds() + time_gap = 60 # 60 seconds grace period + if time_elapsed > (task_time_limit + time_gap): + return True, existing_task # Task is stuck, allow re-execution + return False, existing_task # Task is still running within time limit + + # Re-execute if previous task failed + return True, existing_task + + async def cleanup_old_tasks(self, time_retention: int = 86400) -> int: + """ + Delete old task records older than time_retention seconds. 
+ Args: + time_retention: Time in seconds to retain tasks (default: 86400 = 24 hours) + Returns: + Number of deleted records + """ + cutoff_time = datetime.now(timezone.utc) - timedelta(seconds=time_retention) + + session = self.db_client() + try: + stmt = delete(CeleryTaskExecution).where( + CeleryTaskExecution.created_at < cutoff_time + ) + result = await session.execute(stmt) + await session.commit() + return result.rowcount + finally: + await session.close() \ No newline at end of file From 914d031277a528684d453906fb8c63a2daa79c55 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Mon, 4 Aug 2025 14:37:34 +0300 Subject: [PATCH 56/65] setup celery docker services --- README.md | 18 +++++++---- docker/docker-compose.yml | 62 +++++++++++++++++++++++++++++++++++++ docker/env/.env.example.app | 9 ++++++ 3 files changed, 83 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 3ebe30d3..51b9b010 100644 --- a/README.md +++ b/README.md @@ -102,18 +102,27 @@ $ cd docker $ sudo docker compose up -d ``` -## Run the FastAPI server +## Access Services + +- **FastAPI**: http://localhost:8000 +- **Flower Dashboard**: http://localhost:5555 (admin/password from env) +- **Grafana**: http://localhost:3000 +- **Prometheus**: http://localhost:9090 + +## Run the FastAPI server (Development Mode) ```bash $ uvicorn main:app --reload --host 0.0.0.0 --port 5000 ``` -# Celery +# Celery (Development Mode) + +For development, you can run Celery services manually instead of using Docker: To Run the **Celery worker**, you need to run the following command in a separate terminal: ```bash -$ python -m celery -A celery_app worker --queues=default,file_processing --loglevel=info +$ python -m celery -A celery_app worker --queues=default,file_processing,data_indexing --loglevel=info ``` To run the **Beat scheduler**, you can run the following command in a separate terminal: @@ -128,9 +137,6 @@ To Run **Flower Dashboard**, you can run the following command in a separate ter $ 
python -m celery -A celery_app flower --conf=flowerconfig.py ``` -```bash -python -m celery -A celery_app flower --conf=flowerconfig.py -``` open your browser and go to `http://localhost:5555` to see the dashboard. diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index dc010e25..8483a53b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -18,6 +18,67 @@ services: env_file: - ./env/.env.app + # Celery Worker + celery-worker: + build: + context: .. + dockerfile: docker/minirag/Dockerfile + container_name: celery-worker + volumes: + - fastapi_data:/app/assets + networks: + - backend + restart: always + depends_on: + rabbitmq: + condition: service_healthy + redis: + condition: service_healthy + pgvector: + condition: service_healthy + env_file: + - ./env/.env.app + command: ["python", "-m", "celery", "-A", "celery_app", "worker", "--queues=default,file_processing,data_indexing", "--loglevel=info"] + + # Celery Beat Scheduler + celery-beat: + build: + context: .. + dockerfile: docker/minirag/Dockerfile + container_name: celery-beat + volumes: + - fastapi_data:/app/assets + - celery_beat_data:/app/celerybeat + networks: + - backend + restart: always + depends_on: + rabbitmq: + condition: service_healthy + redis: + condition: service_healthy + env_file: + - ./env/.env.app + command: ["python", "-m", "celery", "-A", "celery_app", "beat", "--loglevel=info"] + + # Flower Dashboard + flower: + build: + context: .. 
+ dockerfile: docker/minirag/Dockerfile + container_name: flower + ports: + - "5555:5555" + networks: + - backend + restart: always + depends_on: + - rabbitmq + - celery-worker + env_file: + - ./env/.env.app + command: ["python", "-m", "celery", "-A", "celery_app", "flower", "--conf=flowerconfig.py"] + # Nginx Service nginx: image: nginx:stable-alpine3.20-perl @@ -186,3 +247,4 @@ volumes: grafana_data: rabbitmq_data: redis_data: + celery_beat_data: diff --git a/docker/env/.env.example.app b/docker/env/.env.example.app index 4b9494ed..fa576a30 100644 --- a/docker/env/.env.example.app +++ b/docker/env/.env.example.app @@ -38,3 +38,12 @@ VECTOR_DB_PGVEC_INDEX_THRESHOLD = 100 # ========================= Template Config ========================= PRIMARY_LANG = "en" DEFAULT_LANG = "en" + +# ========================= Celery Task Queue Config ========================= +CELERY_BROKER_URL="amqp://minirag_user:minirag_rabbitmq_2222@localhost:5672/minirag_vhost" +CELERY_RESULT_BACKEND="redis://:minirag_redis_2222@localhost:6379/0" +CELERY_TASK_SERIALIZER="json" +CELERY_TASK_TIME_LIMIT=600 +CELERY_TASK_ACKS_LATE=false +CELERY_WORKER_CONCURRENCY=2 +CELERY_FLOWER_PASSWORD="minirag_flower_2222" \ No newline at end of file From 77050419ad3dd749702fdbea71e6181e376afcd3 Mon Sep 17 00:00:00 2001 From: Abu Bakr Soliman Date: Fri, 15 Aug 2025 08:41:12 +0300 Subject: [PATCH 57/65] update README file with new tutorials --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 51b9b010..d59c47f8 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,12 @@ This is an educational project where all of the codes where explained (step by s | 19 | Ollama Local LLM Server | [Video](https://youtu.be/-epZ1hAAtrs) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | | 20 | From Mongo to Postgres + SQLAlchemy & Alembic | [Video](https://www.youtube.com/watch?v=BVOq7Ek2Up0) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-013) | | 21 | The way 
to PgVector | [Video](https://www.youtube.com/watch?v=g99yq5zlYAE) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-014) | +| 22 | App Deployments 1/2 | [Video](https://www.youtube.com/watch?v=7QRPnAbVssg) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-015) | +| 22 | App Deployments 2/2 | [Video](https://www.youtube.com/watch?v=qJ5Hdyc4hDc) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-015) | +| 24 | Celery Workers 1/2 | [Video](https://www.youtube.com/watch?v=pX-iWWT2TJo) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-016) | +| 25 | Celery Workers 2/2 | [Video](https://www.youtube.com/watch?v=SZ5Aznjf8Kc) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-017) | + + ## Requirements From ec8d87732895bdbcce3cad52d8a3cb3609f91c80 Mon Sep 17 00:00:00 2001 From: EssamShenhab Date: Sun, 25 Jan 2026 12:03:21 +0200 Subject: [PATCH 58/65] init project directory --- .github/.workflows/deploy-develop.yml | 46 --- .vscode/settings.json | 3 - docker/.gitignore | 3 - docker/README.md | 183 ----------- docker/docker-compose.yml | 250 -------------- docker/env/.env.example.app | 49 --- docker/env/.env.example.grafana | 4 - docker/env/.env.example.postgres | 4 - docker/env/.env.example.postgres-exporter | 3 - docker/env/.env.example.rabbitmq | 13 - docker/env/.env.example.redis | 12 - docker/minirag.service | 23 -- docker/minirag/Dockerfile | 35 -- docker/minirag/alembic.example.ini | 117 ------- docker/minirag/entrypoint.sh | 7 - docker/nginx/default.conf | 17 - docker/prometheus/prometheus.yml | 27 -- docker/rabbitmq/rabbitmq.conf | 18 -- src/.env.example | 50 --- src/.gitignore | 163 ---------- src/assets/.gitignore | 2 - src/assets/.gitkeep | 0 .../mini-rag-app.postman_collection.json | 57 ---- src/celery_app.py | 107 ------ src/controllers/BaseController.py | 35 -- src/controllers/DataController.py | 57 ---- src/controllers/NLPController.py | 141 -------- src/controllers/ProcessController.py | 109 ------- 
src/controllers/ProjectController.py | 22 -- src/controllers/__init__.py | 5 - src/flowerconfig.py | 12 - src/helpers/__init__.py | 0 src/helpers/config.py | 57 ---- src/main.py | 61 ---- src/models/AssetModel.py | 50 --- src/models/BaseDataModel.py | 7 - src/models/ChunkModel.py | 69 ---- src/models/ProjectModel.py | 61 ---- src/models/__init__.py | 3 - src/models/db_schemes/__init__.py | 1 - src/models/db_schemes/minirag/.gitignore | 1 - src/models/db_schemes/minirag/README.md | 21 -- src/models/db_schemes/minirag/__init__.py | 0 .../db_schemes/minirag/alembic.ini.example | 117 ------- src/models/db_schemes/minirag/alembic/README | 1 - src/models/db_schemes/minirag/alembic/env.py | 79 ----- .../db_schemes/minirag/alembic/script.py.mako | 26 -- ...b0_update_celery_task_executions_table_.py | 30 -- ...09b_create_celery_task_executions_table.py | 51 --- .../versions/fee4cd54bd38_initial_commit.py | 76 ----- .../db_schemes/minirag/schemes/__init__.py | 5 - .../db_schemes/minirag/schemes/asset.py | 32 -- .../minirag/schemes/celery_task_execution.py | 34 -- .../db_schemes/minirag/schemes/datachunk.py | 36 --- .../minirag/schemes/minirag_base.py | 2 - .../db_schemes/minirag/schemes/project.py | 18 -- src/models/enums/AssetTypeEnum.py | 6 - src/models/enums/DataBaseEnum.py | 8 - src/models/enums/ProcessingEnum.py | 6 - src/models/enums/ResponseEnums.py | 24 -- src/models/enums/__init__.py | 0 src/requirements.txt | 32 -- src/routes/__init__.py | 0 src/routes/base.py | 23 -- src/routes/data.py | 136 -------- src/routes/nlp.py | 139 -------- src/routes/schemes/__init__.py | 0 src/routes/schemes/data.py | 8 - src/routes/schemes/nlp.py | 9 - src/stores/llm/LLMEnums.py | 23 -- src/stores/llm/LLMInterface.py | 24 -- src/stores/llm/LLMProviderFactory.py | 27 -- src/stores/llm/__init__.py | 0 src/stores/llm/providers/CoHereProvider.py | 101 ------ src/stores/llm/providers/OpenAIProvider.py | 109 ------- src/stores/llm/providers/__init__.py | 2 - 
src/stores/llm/templates/__init__.py | 0 src/stores/llm/templates/locales/__init__.py | 0 .../llm/templates/locales/ar/__init__.py | 0 src/stores/llm/templates/locales/ar/rag.py | 33 -- .../llm/templates/locales/en/__init__.py | 0 src/stores/llm/templates/locales/en/rag.py | 33 -- src/stores/llm/templates/template_parser.py | 43 --- src/stores/vectordb/VectorDBEnums.py | 25 -- src/stores/vectordb/VectorDBInterface.py | 52 --- .../vectordb/VectorDBProviderFactory.py | 31 -- src/stores/vectordb/__init__.py | 0 .../vectordb/providers/PGVectorProvider.py | 306 ------------------ .../vectordb/providers/QdrantDBProvider.py | 152 --------- src/stores/vectordb/providers/__init__.py | 2 - src/tasks/__init__.py | 0 src/tasks/data_indexing.py | 144 --------- src/tasks/file_processing.py | 262 --------------- src/tasks/maintenance.py | 50 --- src/tasks/process_workflow.py | 54 ---- src/utils/__init__.py | 0 src/utils/idempotency_manager.py | 124 ------- src/utils/metrics.py | 36 --- 98 files changed, 4436 deletions(-) delete mode 100644 .github/.workflows/deploy-develop.yml delete mode 100644 .vscode/settings.json delete mode 100644 docker/.gitignore delete mode 100644 docker/README.md delete mode 100644 docker/docker-compose.yml delete mode 100644 docker/env/.env.example.app delete mode 100644 docker/env/.env.example.grafana delete mode 100644 docker/env/.env.example.postgres delete mode 100644 docker/env/.env.example.postgres-exporter delete mode 100644 docker/env/.env.example.rabbitmq delete mode 100644 docker/env/.env.example.redis delete mode 100644 docker/minirag.service delete mode 100644 docker/minirag/Dockerfile delete mode 100644 docker/minirag/alembic.example.ini delete mode 100644 docker/minirag/entrypoint.sh delete mode 100644 docker/nginx/default.conf delete mode 100644 docker/prometheus/prometheus.yml delete mode 100644 docker/rabbitmq/rabbitmq.conf delete mode 100644 src/.env.example delete mode 100644 src/.gitignore delete mode 100644 src/assets/.gitignore 
delete mode 100644 src/assets/.gitkeep delete mode 100644 src/assets/mini-rag-app.postman_collection.json delete mode 100644 src/celery_app.py delete mode 100644 src/controllers/BaseController.py delete mode 100644 src/controllers/DataController.py delete mode 100644 src/controllers/NLPController.py delete mode 100644 src/controllers/ProcessController.py delete mode 100644 src/controllers/ProjectController.py delete mode 100644 src/controllers/__init__.py delete mode 100644 src/flowerconfig.py delete mode 100644 src/helpers/__init__.py delete mode 100644 src/helpers/config.py delete mode 100644 src/main.py delete mode 100644 src/models/AssetModel.py delete mode 100644 src/models/BaseDataModel.py delete mode 100644 src/models/ChunkModel.py delete mode 100644 src/models/ProjectModel.py delete mode 100644 src/models/__init__.py delete mode 100644 src/models/db_schemes/__init__.py delete mode 100644 src/models/db_schemes/minirag/.gitignore delete mode 100644 src/models/db_schemes/minirag/README.md delete mode 100644 src/models/db_schemes/minirag/__init__.py delete mode 100644 src/models/db_schemes/minirag/alembic.ini.example delete mode 100644 src/models/db_schemes/minirag/alembic/README delete mode 100644 src/models/db_schemes/minirag/alembic/env.py delete mode 100644 src/models/db_schemes/minirag/alembic/script.py.mako delete mode 100644 src/models/db_schemes/minirag/alembic/versions/243ca8b683b0_update_celery_task_executions_table_.py delete mode 100644 src/models/db_schemes/minirag/alembic/versions/b9f9e870b09b_create_celery_task_executions_table.py delete mode 100644 src/models/db_schemes/minirag/alembic/versions/fee4cd54bd38_initial_commit.py delete mode 100644 src/models/db_schemes/minirag/schemes/__init__.py delete mode 100644 src/models/db_schemes/minirag/schemes/asset.py delete mode 100644 src/models/db_schemes/minirag/schemes/celery_task_execution.py delete mode 100644 src/models/db_schemes/minirag/schemes/datachunk.py delete mode 100644 
src/models/db_schemes/minirag/schemes/minirag_base.py delete mode 100644 src/models/db_schemes/minirag/schemes/project.py delete mode 100644 src/models/enums/AssetTypeEnum.py delete mode 100644 src/models/enums/DataBaseEnum.py delete mode 100644 src/models/enums/ProcessingEnum.py delete mode 100644 src/models/enums/ResponseEnums.py delete mode 100644 src/models/enums/__init__.py delete mode 100644 src/requirements.txt delete mode 100644 src/routes/__init__.py delete mode 100644 src/routes/base.py delete mode 100644 src/routes/data.py delete mode 100644 src/routes/nlp.py delete mode 100644 src/routes/schemes/__init__.py delete mode 100644 src/routes/schemes/data.py delete mode 100644 src/routes/schemes/nlp.py delete mode 100644 src/stores/llm/LLMEnums.py delete mode 100644 src/stores/llm/LLMInterface.py delete mode 100644 src/stores/llm/LLMProviderFactory.py delete mode 100644 src/stores/llm/__init__.py delete mode 100644 src/stores/llm/providers/CoHereProvider.py delete mode 100644 src/stores/llm/providers/OpenAIProvider.py delete mode 100644 src/stores/llm/providers/__init__.py delete mode 100644 src/stores/llm/templates/__init__.py delete mode 100644 src/stores/llm/templates/locales/__init__.py delete mode 100644 src/stores/llm/templates/locales/ar/__init__.py delete mode 100644 src/stores/llm/templates/locales/ar/rag.py delete mode 100644 src/stores/llm/templates/locales/en/__init__.py delete mode 100644 src/stores/llm/templates/locales/en/rag.py delete mode 100644 src/stores/llm/templates/template_parser.py delete mode 100644 src/stores/vectordb/VectorDBEnums.py delete mode 100644 src/stores/vectordb/VectorDBInterface.py delete mode 100644 src/stores/vectordb/VectorDBProviderFactory.py delete mode 100644 src/stores/vectordb/__init__.py delete mode 100644 src/stores/vectordb/providers/PGVectorProvider.py delete mode 100644 src/stores/vectordb/providers/QdrantDBProvider.py delete mode 100644 src/stores/vectordb/providers/__init__.py delete mode 100644 
src/tasks/__init__.py delete mode 100644 src/tasks/data_indexing.py delete mode 100644 src/tasks/file_processing.py delete mode 100644 src/tasks/maintenance.py delete mode 100644 src/tasks/process_workflow.py delete mode 100644 src/utils/__init__.py delete mode 100644 src/utils/idempotency_manager.py delete mode 100644 src/utils/metrics.py diff --git a/.github/.workflows/deploy-develop.yml b/.github/.workflows/deploy-develop.yml deleted file mode 100644 index f71ac894..00000000 --- a/.github/.workflows/deploy-develop.yml +++ /dev/null @@ -1,46 +0,0 @@ -name: Deploy Develop Branch to Server - -on: - push: - branches: - - develop - -jobs: - deploy: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4.2.2 - - - name: Deploy via SSH - uses: appleboy/ssh-action@v1.2.2 - with: - host: ${{ secrets.SSH_DEVELOP_HOST_IP }} - username: github_user - key: ${{ secrets.SSH_DEVELOP_PRIVATE_KEY }} - script: | - cd /home/github_user/workspace/mini-rag - git checkout develop - git pull - sudo systemctl restart minirag.service - echo "Waiting..." - sleep 20 - - for i in {1..6}; do - if ss -tuln | grep -q ':80'; then - echo "✅ Port 80 is now active." - break - else - echo "⏳ Port 80 not ready yet. Retrying in 5 seconds..." - sleep 5 - fi - done - - if ! 
ss -tuln | grep -q ':80'; then - echo "❌ Service failed to start on port 80" - exit 1 - fi - - - diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 99ff45e0..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.languageServer": "Pylance" -} \ No newline at end of file diff --git a/docker/.gitignore b/docker/.gitignore deleted file mode 100644 index 975aa757..00000000 --- a/docker/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -env/.env* -!env/.env.example.* -minirag/alembic.ini diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index fba41cad..00000000 --- a/docker/README.md +++ /dev/null @@ -1,183 +0,0 @@ -# Docker Setup for MiniRAG Application - -This directory contains the Docker setup for the MiniRAG application, including all necessary services for development and monitoring. - -## Services - -- **FastAPI Application**: Main application running on Uvicorn -- **Nginx**: Web server for serving the FastAPI application -- **PostgreSQL (pgvector)**: Vector-enabled database for storing embeddings -- **Postgres-Exporter**: Exports PostgreSQL metrics for Prometheus -- **Qdrant**: Vector database for similarity search -- **Prometheus**: Metrics collection -- **Grafana**: Visualization dashboard for metrics -- **Node-Exporter**: System metrics collection - -## Setup Instructions - -### 1. Set up environment files - -Create your environment files from the examples: - -```bash -# Create all required .env files from examples -cd docker/env -cp .env.example.app .env.app -cp .env.example.postgres .env.postgres -cp .env.example.grafana .env.grafana -cp .env.example.postgres-exporter .env.postgres-exporter - -# Setup the Alembic configuration for the FastAPI application -cd .. -cd docker/minirag -cp alembic.example.ini alembic.ini - -### 2. 
Start the services - -```bash -cd docker -docker compose up --build -d -``` - -To start only specific services: - -```bash -docker compose up -d fastapi nginx pgvector qdrant -``` - -If you encounter connection issues, you may want to start the database services first and let them initialize before starting the application: - -```bash -# Start databases first -docker compose up -d pgvector qdrant postgres-exporter -# Wait for databases to be healthy -sleep 30 -# Start the application services -docker compose up fastapi nginx prometheus grafana node-exporter --build -d -``` - -In case deleting all containers and volumes is necessary, you can run: - -```bash -docker compose down -v --remove-orphans -``` - -### 3. Access the services - -- FastAPI Application: http://localhost:8000 -- FastAPI Documentation: http://localhost:8000/docs -- Nginx (serving FastAPI): http://localhost -- Prometheus: http://localhost:9090 -- Grafana: http://localhost:3000 -- Qdrant UI: http://localhost:6333/dashboard - -## Volume Management - -### Managing Docker Volumes - -Docker volumes are used to persist data generated by and used by Docker containers. Here are some commands to manage your volumes: - -1. **List all volumes**: - ```bash - docker volume ls - ``` -2. **Inspect a volume**: - ```bash - docker volume inspect - ``` - - - list files in a volume: - ```bash - docker run --rm -v :/data busybox ls -l /data - ``` - -3. **Remove a volume**: - ```bash - docker volume rm - ``` -4. **Prune unused volumes**: - ```bash - docker volume prune - ``` - -5. **Backup volume for migration**: - ```bash - docker run --rm -v :/volume -v $(pwd):/backup alpine tar cvf /backup/backup.tar /volume - ``` - -6. **Restore volume from backup**: - ```bash - docker run --rm -v :/volume -v $(pwd):/backup alpine sh -c "cd /volume && tar xvf /backup/backup.tar --strip 1" - ``` - -7. 
**Remove all volumes**: - ```bash - docker volume rm $(docker volume ls -q) - ``` - -**NOTE**: For PostgreSQL specifically, you might want to consider using PostgreSQL's built-in tools like `pg_dump` and `pg_restore` for more reliable backups, especially for live databases. - -## Monitoring - -### FastAPI Metrics - -FastAPI is configured to expose Prometheus metrics at the `/metrics` endpoint. These metrics include: - -- Request counts -- Request latencies -- Status codes - -Prometheus is configured to scrape these metrics automatically. - -### Visualizing Metrics in Grafana - -1. Log into Grafana at http://localhost:3000 (default credentials: admin/admin_password) -2. Add Prometheus as a data source (URL: http://prometheus:9090) -3. Import dashboards for FastAPI, PostgreSQL, and Qdrant - -#### Dashboards URLs - -https://grafana.com/grafana/dashboards/18739-fastapi-observability/ - -https://grafana.com/grafana/dashboards/1860-node-exporter-full/ - -https://grafana.com/grafana/dashboards/23033-qdrant/ - -https://grafana.com/grafana/dashboards/12485-postgresql-exporter/ - - -## Development Workflow - -The FastAPI application is configured with hot-reloading. Any changes to the code in the `src/` directory will automatically reload the application. - -## Troubleshooting - -### Connection Errors - -If you see connection errors when starting the services: - -1. **Database Connection Refused**: This often happens when the FastAPI app tries to connect to databases before they're ready. - ``` - Connection refused: [Errno 111] Connection refused - ``` - - Solutions: - - Start database services first, wait, then start the application - - Check database logs: `docker compose logs pgvector` - - Ensure your database credentials in `.env.app` match those in `.env.postgres` - -2. **Restart the FastAPI service** after databases are running: - ```bash - docker compose restart fastapi - ``` - -3. **Check service status**: - ```bash - docker compose ps - ``` - -4. 
**View logs** for more details: - ```bash - docker compose logs --tail=100 fastapi - docker compose logs --tail=100 pgvector - ``` diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index 8483a53b..00000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,250 +0,0 @@ -services: - # FastAPI Application - fastapi: - build: - context: .. - dockerfile: docker/minirag/Dockerfile - container_name: fastapi - ports: - - "8000:8000" - volumes: - - fastapi_data:/app/assets - networks: - - backend - restart: always - depends_on: - pgvector: - condition: service_healthy - env_file: - - ./env/.env.app - - # Celery Worker - celery-worker: - build: - context: .. - dockerfile: docker/minirag/Dockerfile - container_name: celery-worker - volumes: - - fastapi_data:/app/assets - networks: - - backend - restart: always - depends_on: - rabbitmq: - condition: service_healthy - redis: - condition: service_healthy - pgvector: - condition: service_healthy - env_file: - - ./env/.env.app - command: ["python", "-m", "celery", "-A", "celery_app", "worker", "--queues=default,file_processing,data_indexing", "--loglevel=info"] - - # Celery Beat Scheduler - celery-beat: - build: - context: .. - dockerfile: docker/minirag/Dockerfile - container_name: celery-beat - volumes: - - fastapi_data:/app/assets - - celery_beat_data:/app/celerybeat - networks: - - backend - restart: always - depends_on: - rabbitmq: - condition: service_healthy - redis: - condition: service_healthy - env_file: - - ./env/.env.app - command: ["python", "-m", "celery", "-A", "celery_app", "beat", "--loglevel=info"] - - # Flower Dashboard - flower: - build: - context: .. 
- dockerfile: docker/minirag/Dockerfile - container_name: flower - ports: - - "5555:5555" - networks: - - backend - restart: always - depends_on: - - rabbitmq - - celery-worker - env_file: - - ./env/.env.app - command: ["python", "-m", "celery", "-A", "celery_app", "flower", "--conf=flowerconfig.py"] - - # Nginx Service - nginx: - image: nginx:stable-alpine3.20-perl - container_name: nginx - ports: - - "80:80" - volumes: - - ./nginx/default.conf:/etc/nginx/conf.d/default.conf - depends_on: - - fastapi - networks: - - backend - restart: always - - - # PostgreSQL (pgvector) - pgvector: - image: pgvector/pgvector:0.8.0-pg17 - container_name: pgvector - ports: - - "5400:5432" - volumes: - - 'pgvector:/var/lib/postgresql/data' - env_file: - - ./env/.env.postgres - networks: - - backend - restart: always - healthcheck: - test: ["CMD-SHELL", "pg_isready -U postgres"] - interval: 5s - timeout: 5s - retries: 5 - start_period: 10s - - # Qdrant (VectorDB) - qdrant: - image: qdrant/qdrant:v1.13.6 - container_name: qdrant - ports: - - "6333:6333" - - "6334:6334" - volumes: - - qdrant_data:/qdrant/storage - networks: - - backend - restart: always - - # Prometheus Monitoring - prometheus: - image: prom/prometheus:v3.3.0 - container_name: prometheus - ports: - - "9090:9090" - volumes: - - prometheus_data:/prometheus - - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml - networks: - - backend - restart: always - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--web.enable-lifecycle' - - # Grafana Dashboard - grafana: - image: grafana/grafana:11.6.0-ubuntu - container_name: grafana - ports: - - "3000:3000" - volumes: - - grafana_data:/var/lib/grafana - env_file: - - ./env/.env.grafana - depends_on: - - prometheus - networks: - - backend - restart: always - - - # Node Exporter for system metrics - 
node-exporter: - image: prom/node-exporter:v1.9.1 - container_name: node-exporter - ports: - - "9100:9100" - volumes: - - /proc:/host/proc:ro - - /sys:/host/sys:ro - - /:/rootfs:ro - command: - - '--path.procfs=/host/proc' - - '--path.rootfs=/rootfs' - - '--path.sysfs=/host/sys' - - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' - networks: - - backend - restart: always - - # PostgreSQL Exporter for Postgres metrics - postgres-exporter: - image: prometheuscommunity/postgres-exporter:v0.17.1 - container_name: postgres-exporter - ports: - - "9187:9187" - env_file: - - ./env/.env.postgres-exporter - depends_on: - - pgvector - networks: - - backend - restart: always - - # RabbitMQ (Message Broker) - rabbitmq: - image: rabbitmq:4.1.2-management-alpine - container_name: rabbitmq - ports: - - "5672:5672" # AMQP port - - "15672:15672" # Management UI port - volumes: - - rabbitmq_data:/var/lib/rabbitmq - - ./rabbitmq/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf - env_file: - - ./env/.env.rabbitmq - networks: - - backend - restart: always - healthcheck: - test: ["CMD", "rabbitmq-diagnostics", "ping"] - timeout: 10s - retries: 5 - - # Redis (Results Backend & Cache) - redis: - image: redis:8.0.3-alpine - container_name: redis - ports: - - "6379:6379" - volumes: - - redis_data:/data - env_file: - - ./env/.env.redis - networks: - - backend - restart: always - healthcheck: - test: ["CMD", "redis-cli", "ping"] - timeout: 10s - retries: 5 - command: ["redis-server", "--appendonly", "yes", "--requirepass", "${REDIS_PASSWORD:-minirag_redis_2222}"] - -networks: - backend: - driver: bridge - -volumes: - fastapi_data: - pgvector: - qdrant_data: - prometheus_data: - grafana_data: - rabbitmq_data: - redis_data: - celery_beat_data: diff --git a/docker/env/.env.example.app b/docker/env/.env.example.app deleted file mode 100644 index fa576a30..00000000 --- a/docker/env/.env.example.app +++ /dev/null @@ -1,49 +0,0 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" - 
-FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - -POSTGRES_USERNAME="postgres" -POSTGRES_PASSWORD="postgres_password" -POSTGRES_HOST="pgvector" -POSTGRES_PORT=5432 -POSTGRES_MAIN_DATABASE="minirag" - -# ========================= LLM Config ========================= -GENERATION_BACKEND = "OPENAI" -EMBEDDING_BACKEND = "COHERE" - -OPENAI_API_KEY="key___" -OPENAI_API_URL= "" -COHERE_API_KEY="key___" - -GENERATION_MODEL_ID_LITERAL = ["gpt-4o-mini", "gemma2:9b-instruct-q5_0"] -GENERATION_MODEL_ID="gpt-4o-mini" -EMBEDDING_MODEL_ID="embed-multilingual-v3.0" -EMBEDDING_MODEL_SIZE=1024 - -INPUT_DAFAULT_MAX_CHARACTERS=1024 -GENERATION_DAFAULT_MAX_TOKENS=200 -GENERATION_DAFAULT_TEMPERATURE=0.1 - -# ========================= Vector DB Config ========================= -VECTOR_DB_BACKEND_LITERAL = ["QDRANT", "PGVECTOR"] -VECTOR_DB_BACKEND = "PGVECTOR" -VECTOR_DB_PATH = "qdrant_db" -VECTOR_DB_DISTANCE_METHOD = "cosine" -VECTOR_DB_PGVEC_INDEX_THRESHOLD = 100 - -# ========================= Template Config ========================= -PRIMARY_LANG = "en" -DEFAULT_LANG = "en" - -# ========================= Celery Task Queue Config ========================= -CELERY_BROKER_URL="amqp://minirag_user:minirag_rabbitmq_2222@localhost:5672/minirag_vhost" -CELERY_RESULT_BACKEND="redis://:minirag_redis_2222@localhost:6379/0" -CELERY_TASK_SERIALIZER="json" -CELERY_TASK_TIME_LIMIT=600 -CELERY_TASK_ACKS_LATE=false -CELERY_WORKER_CONCURRENCY=2 -CELERY_FLOWER_PASSWORD="minirag_flower_2222" \ No newline at end of file diff --git a/docker/env/.env.example.grafana b/docker/env/.env.example.grafana deleted file mode 100644 index 127bedc9..00000000 --- a/docker/env/.env.example.grafana +++ /dev/null @@ -1,4 +0,0 @@ -# Grafana Environment Variables -GF_SECURITY_ADMIN_USER=admin -GF_SECURITY_ADMIN_PASSWORD=admin_password -GF_USERS_ALLOW_SIGN_UP=false diff --git a/docker/env/.env.example.postgres b/docker/env/.env.example.postgres deleted file 
mode 100644 index cf4f155e..00000000 --- a/docker/env/.env.example.postgres +++ /dev/null @@ -1,4 +0,0 @@ -# PostgreSQL Environment Variables -POSTGRES_USER=postgres -POSTGRES_PASSWORD=postgres_password -POSTGRES_DB=minirag diff --git a/docker/env/.env.example.postgres-exporter b/docker/env/.env.example.postgres-exporter deleted file mode 100644 index 27483a27..00000000 --- a/docker/env/.env.example.postgres-exporter +++ /dev/null @@ -1,3 +0,0 @@ -DATA_SOURCE_URI=pgvector:5432/postgres?sslmode=disable -DATA_SOURCE_USER=postgres -DATA_SOURCE_PASS=postgres_password \ No newline at end of file diff --git a/docker/env/.env.example.rabbitmq b/docker/env/.env.example.rabbitmq deleted file mode 100644 index 9c8e1174..00000000 --- a/docker/env/.env.example.rabbitmq +++ /dev/null @@ -1,13 +0,0 @@ -# RabbitMQ Configuration Example -RABBITMQ_DEFAULT_USER=minirag_user -RABBITMQ_DEFAULT_PASS=minirag_rabbitmq_2222 -RABBITMQ_DEFAULT_VHOST=minirag_vhost - -# Management Plugin -RABBITMQ_MANAGEMENT_ENABLED=true - -# Security -RABBITMQ_AUTH_BACKENDS=rabbit_auth_backend_internal - -# Performance -RABBITMQ_DISK_FREE_LIMIT=2000000000 diff --git a/docker/env/.env.example.redis b/docker/env/.env.example.redis deleted file mode 100644 index 01ce9b38..00000000 --- a/docker/env/.env.example.redis +++ /dev/null @@ -1,12 +0,0 @@ -# Redis Configuration Example -REDIS_PASSWORD=minirag_redis_2222 - -# Persistence -REDIS_APPENDONLY=yes - -# Memory Management -REDIS_MAXMEMORY=512mb -REDIS_MAXMEMORY_POLICY=allkeys-lru - -# Security -REDIS_PROTECTED_MODE=yes diff --git a/docker/minirag.service b/docker/minirag.service deleted file mode 100644 index 6d998bd3..00000000 --- a/docker/minirag.service +++ /dev/null @@ -1,23 +0,0 @@ -[Unit] -Description=MiniRAG Docker Service -After=network.target docker.service -Requires=docker.service - -[Service] -Type=forking -RemainAfterExit=yes -User=github_user -Group=docker -WorkingDirectory=/home/github_user/workspace/mini-rag/docker -ExecStartPre=/bin/bash -c 
'/usr/bin/docker compose down || true' -ExecStartPre=/bin/sleep 5 -ExecStart=/usr/bin/docker compose up --build -d -ExecStop=/usr/bin/docker compose down -ExecReload=/usr/bin/docker compose restart -TimeoutStartSec=300 -TimeoutStopSec=120 -Restart=on-failure -RestartSec=10 - -[Install] -WantedBy=multi-user.target \ No newline at end of file diff --git a/docker/minirag/Dockerfile b/docker/minirag/Dockerfile deleted file mode 100644 index 7447fbab..00000000 --- a/docker/minirag/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -FROM ghcr.io/astral-sh/uv:0.6.14-python3.10-bookworm - -WORKDIR /app - -# Install additional system dependencies for lxml and other packages -RUN apt-get update && apt-get install -y \ - build-essential \ - libavif-dev pkg-config \ - libjpeg-dev \ - gcc unzip zip \ - python3-dev \ - libxml2-dev \ - libxslt1-dev \ - libffi-dev \ - curl \ - && rm -rf /var/lib/apt/lists/* - -COPY src/requirements.txt . - -RUN uv pip install -r requirements.txt --system - -COPY src/ . - -# Create directory structure for Alembic -RUN mkdir -p /app/models/db_schemes/minirag/ - -COPY docker/minirag/alembic.ini /app/models/db_schemes/minirag/alembic.ini - -COPY docker/minirag/entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh - -ENTRYPOINT ["/entrypoint.sh"] - -# Command to run the application -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"] diff --git a/docker/minirag/alembic.example.ini b/docker/minirag/alembic.example.ini deleted file mode 100644 index 346ec24f..00000000 --- a/docker/minirag/alembic.example.ini +++ /dev/null @@ -1,117 +0,0 @@ -# A generic, single database configuration. 
- -[alembic] -# path to migration scripts -# Use forward slashes (/) also on windows to provide an os agnostic path -script_location = alembic - -# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s -# Uncomment the line below if you want the files to be prepended with date and time -# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file -# for all available tokens -# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s - -# sys.path path, will be prepended to sys.path if present. -# defaults to the current working directory. -prepend_sys_path = . - -# timezone to use when rendering the date within the migration file -# as well as the filename. -# If specified, requires the python>=3.9 or backports.zoneinfo library. -# Any required deps can installed by adding `alembic[tz]` to the pip requirements -# string value is passed to ZoneInfo() -# leave blank for localtime -# timezone = - -# max length of characters to apply to the "slug" field -# truncate_slug_length = 40 - -# set to 'true' to run the environment during -# the 'revision' command, regardless of autogenerate -# revision_environment = false - -# set to 'true' to allow .pyc and .pyo files without -# a source .py file to be detected as revisions in the -# versions/ directory -# sourceless = false - -# version location specification; This defaults -# to alembic/versions. When using multiple version -# directories, initial revisions must be specified with --version-path. -# The path separator used here should be the separator specified by "version_path_separator" below. -# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions - -# version path separator; As mentioned above, this is the character used to split -# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 
-# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. -# Valid values for version_path_separator are: -# -# version_path_separator = : -# version_path_separator = ; -# version_path_separator = space -# version_path_separator = newline -version_path_separator = os # Use os.pathsep. Default configuration used for new projects. - -# set to 'true' to search source files recursively -# in each "version_locations" directory -# new in Alembic version 1.10 -# recursive_version_locations = false - -# the output encoding used when revision files -# are written from script.py.mako -# output_encoding = utf-8 - -sqlalchemy.url = postgresql://postgres:postgres_password@pgvector:5432/minirag - - -[post_write_hooks] -# post_write_hooks defines scripts or Python functions that are run -# on newly generated revision scripts. See the documentation for further -# detail and examples - -# format using "black" - use the console_scripts runner, against the "black" entrypoint -# hooks = black -# black.type = console_scripts -# black.entrypoint = black -# black.options = -l 79 REVISION_SCRIPT_FILENAME - -# lint with attempts to fix using "ruff" - use the exec runner, execute a binary -# hooks = ruff -# ruff.type = exec -# ruff.executable = %(here)s/.venv/bin/ruff -# ruff.options = --fix REVISION_SCRIPT_FILENAME - -# Logging configuration -[loggers] -keys = root,sqlalchemy,alembic - -[handlers] -keys = console - -[formatters] -keys = generic - -[logger_root] -level = WARNING -handlers = console -qualname = - -[logger_sqlalchemy] -level = WARNING -handlers = -qualname = sqlalchemy.engine - -[logger_alembic] -level = INFO -handlers = -qualname = alembic - -[handler_console] -class = StreamHandler -args = (sys.stderr,) -level = NOTSET -formatter = generic - -[formatter_generic] -format = %(levelname)-5.5s [%(name)s] %(message)s -datefmt = %H:%M:%S diff --git a/docker/minirag/entrypoint.sh b/docker/minirag/entrypoint.sh deleted file mode 
100644 index 198c8cb0..00000000 --- a/docker/minirag/entrypoint.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -e - -echo "Running database migrations..." -cd /app/models/db_schemes/minirag/ -alembic upgrade head -cd /app diff --git a/docker/nginx/default.conf b/docker/nginx/default.conf deleted file mode 100644 index 87632089..00000000 --- a/docker/nginx/default.conf +++ /dev/null @@ -1,17 +0,0 @@ -server { - listen 80; - server_name localhost; - - location / { - proxy_pass http://fastapi:8000; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } - - # Optionally expose metrics endpoint directly - location /TrhBVe_m5gg2002_E5VVqS { - proxy_pass http://fastapi:8000/TrhBVe_m5gg2002_E5VVqS; - } -} \ No newline at end of file diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml deleted file mode 100644 index 802caa6d..00000000 --- a/docker/prometheus/prometheus.yml +++ /dev/null @@ -1,27 +0,0 @@ -global: - scrape_interval: 15s - evaluation_interval: 15s - -scrape_configs: - - - job_name: 'fastapi' - static_configs: - - targets: ['fastapi:8000'] - metrics_path: '/TrhBVe_m5gg2002_E5VVqS' - - - job_name: 'node-exporter' - static_configs: - - targets: ['node-exporter:9100'] - - - job_name: 'prometheus' - static_configs: - - targets: ['localhost:9090'] - - - job_name: 'qdrant' - static_configs: - - targets: ['qdrant:6333'] - metrics_path: '/metrics' - - - job_name: 'postgres' - static_configs: - - targets: ['postgres-exporter:9187'] diff --git a/docker/rabbitmq/rabbitmq.conf b/docker/rabbitmq/rabbitmq.conf deleted file mode 100644 index 5cf61d57..00000000 --- a/docker/rabbitmq/rabbitmq.conf +++ /dev/null @@ -1,18 +0,0 @@ -# RabbitMQ Configuration File - -# Memory management -vm_memory_high_watermark.relative = 0.6 - -# Disk space management -disk_free_limit.absolute = 2GB - -# SSL/TLS configuration 
-ssl_options.verify = verify_none - -# Management plugin -management.tcp.port = 15672 - -# Logging -log.file.level = info -log.console = true -log.console.level = info diff --git a/src/.env.example b/src/.env.example deleted file mode 100644 index 40209588..00000000 --- a/src/.env.example +++ /dev/null @@ -1,50 +0,0 @@ -APP_NAME="mini-RAG" -APP_VERSION="0.1" -OPENAI_API_KEY="sk-" - -FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] -FILE_MAX_SIZE=10 -FILE_DEFAULT_CHUNK_SIZE=512000 # 512KB - -POSTGRES_USERNAME="postgres" -POSTGRES_PASSWORD="minirag2222" -POSTGRES_HOST="localhost" -POSTGRES_PORT=5432 -POSTGRES_MAIN_DATABASE="minirag" - -# ========================= LLM Config ========================= -GENERATION_BACKEND = "OPENAI" -EMBEDDING_BACKEND = "COHERE" - -OPENAI_API_KEY="sk-" -OPENAI_API_URL= -COHERE_API_KEY="m8-" - -GENERATION_MODEL_ID_LITERAL = ["gpt-4o-mini", "gpt-4o"] -GENERATION_MODEL_ID="gpt-4o-mini" -EMBEDDING_MODEL_ID="embed-multilingual-light-v3.0" -EMBEDDING_MODEL_SIZE=384 - -INPUT_DAFAULT_MAX_CHARACTERS=1024 -GENERATION_DAFAULT_MAX_TOKENS=200 -GENERATION_DAFAULT_TEMPERATURE=0.1 - -# ========================= Vector DB Config ========================= -VECTOR_DB_BACKEND_LITERAL = ["QDRANT", "PGVECTOR"] -VECTOR_DB_BACKEND = "PGVECTOR" -VECTOR_DB_PATH = "qdrant_db" -VECTOR_DB_DISTANCE_METHOD = "cosine" -VECTOR_DB_PGVEC_INDEX_THRESHOLD = - -# ========================= Template Configs ========================= -PRIMARY_LANG = "ar" -DEFAULT_LANG = "en" - -# ========================= Celery Task Queue Config ========================= -CELERY_BROKER_URL="amqp://minirag_user:minirag_rabbitmq_2222@localhost:5672/minirag_vhost" -CELERY_RESULT_BACKEND="redis://:minirag_redis_2222@localhost:6379/0" -CELERY_TASK_SERIALIZER="json" -CELERY_TASK_TIME_LIMIT=600 -CELERY_TASK_ACKS_LATE=false -CELERY_WORKER_CONCURRENCY=2 -CELERY_FLOWER_PASSWORD="minirag_flower_2222" diff --git a/src/.gitignore b/src/.gitignore deleted file mode 100644 index 3069115f..00000000 --- 
a/src/.gitignore +++ /dev/null @@ -1,163 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so -*.bak -*.dat -*.dir - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. 
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ diff --git a/src/assets/.gitignore b/src/assets/.gitignore deleted file mode 100644 index ac32e2b6..00000000 --- a/src/assets/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -files -database diff --git a/src/assets/.gitkeep b/src/assets/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/src/assets/mini-rag-app.postman_collection.json b/src/assets/mini-rag-app.postman_collection.json deleted file mode 100644 index 2d58f447..00000000 --- a/src/assets/mini-rag-app.postman_collection.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "info": { - "_postman_id": "1f67dedb-1b93-4639-9f96-7fe8681693f4", - "name": "mini-rag-app", - "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json", - "_exporter_id": "854486", - "_collection_link": "https://www.postman.com/gold-water-645258/workspace/mini-rag-app/collection/854486-1f67dedb-1b93-4639-9f96-7fe8681693f4?action=share&source=collection_link&creator=854486" - }, - "item": [ - { - "name": "welcome-request", - "request": { - "method": "GET", - "header": [], - "url": { - "raw": "{{api}}/welcome", - "host": [ - "{{api}}" - ], - "path": [ - "welcome" - ] - } - }, - "response": [] - } - ], - "event": [ - { - "listen": "prerequest", - "script": { - "type": "text/javascript", - "packages": {}, - "exec": [ - "" - ] - } - }, - { - "listen": "test", - "script": { - "type": "text/javascript", - "packages": {}, - "exec": [ - "" - ] - } - } - ], - "variable": [ - { - "key": "api", - "value": "http://127.0.0.1:8000", - "type": "string" - } - ] -} \ No newline at end of file diff --git a/src/celery_app.py b/src/celery_app.py deleted file mode 100644 index 4831ab8e..00000000 --- a/src/celery_app.py +++ /dev/null @@ -1,107 +0,0 @@ -from celery import Celery -from helpers.config import get_settings - -from stores.llm.LLMProviderFactory import LLMProviderFactory -from stores.vectordb.VectorDBProviderFactory import VectorDBProviderFactory -from stores.llm.templates.template_parser import TemplateParser -from 
sqlalchemy.ext.asyncio import create_async_engine, AsyncSession -from sqlalchemy.orm import sessionmaker - -settings = get_settings() - -async def get_setup_utils(): - settings = get_settings() - - postgres_conn = f"postgresql+asyncpg://{settings.POSTGRES_USERNAME}:{settings.POSTGRES_PASSWORD}@{settings.POSTGRES_HOST}:{settings.POSTGRES_PORT}/{settings.POSTGRES_MAIN_DATABASE}" - - db_engine = create_async_engine(postgres_conn) - db_client = sessionmaker( - db_engine, class_=AsyncSession, expire_on_commit=False - ) - - llm_provider_factory = LLMProviderFactory(settings) - vectordb_provider_factory = VectorDBProviderFactory(config=settings, db_client=db_client) - - # generation client - generation_client = llm_provider_factory.create(provider=settings.GENERATION_BACKEND) - generation_client.set_generation_model(model_id = settings.GENERATION_MODEL_ID) - - # embedding client - embedding_client = llm_provider_factory.create(provider=settings.EMBEDDING_BACKEND) - embedding_client.set_embedding_model(model_id=settings.EMBEDDING_MODEL_ID, - embedding_size=settings.EMBEDDING_MODEL_SIZE) - - # vector db client - vectordb_client = vectordb_provider_factory.create( - provider=settings.VECTOR_DB_BACKEND - ) - await vectordb_client.connect() - - template_parser = TemplateParser( - language=settings.PRIMARY_LANG, - default_language=settings.DEFAULT_LANG, - ) - - return (db_engine, db_client, llm_provider_factory, vectordb_provider_factory, - generation_client, embedding_client, vectordb_client, template_parser) - -# Create Celery application instance -celery_app = Celery( - "minirag", - broker=settings.CELERY_BROKER_URL, - backend=settings.CELERY_RESULT_BACKEND, - include=[ - "tasks.file_processing", - "tasks.data_indexing", - "tasks.process_workflow", - "tasks.maintenance", - ] -) - -# Configure Celery with essential settings -celery_app.conf.update( - task_serializer=settings.CELERY_TASK_SERIALIZER, - result_serializer=settings.CELERY_TASK_SERIALIZER, - accept_content=[ - 
settings.CELERY_TASK_SERIALIZER - ], - - # Task safety - Late acknowledgment prevents task loss on worker crash - task_acks_late=settings.CELERY_TASK_ACKS_LATE, - - # Time limits - Prevent hanging tasks - task_time_limit=settings.CELERY_TASK_TIME_LIMIT, - - # Result backend - Store results for status tracking - task_ignore_resul=False, - result_expires=3600, - - # Worker settings - worker_concurrency=settings.CELERY_WORKER_CONCURRENCY, - - # Connection settings for better reliability - broker_connection_retry_on_startup=True, - broker_connection_retry=True, - broker_connection_max_retries=10, - worker_cancel_long_running_tasks_on_connection_loss=True, - - task_routes={ - "tasks.file_processing.process_project_files": {"queue": "file_processing"}, - "tasks.data_indexing.index_data_content": {"queue": "data_indexing"}, - "tasks.process_workflow.process_and_push_workflow": {"queue": "file_processing"}, - "tasks.maintenance.clean_celery_executions_table": {"queue": "default"}, - }, - - beat_schedule={ - 'cleanup-old-task-records': { - 'task': "tasks.maintenance.clean_celery_executions_table", - 'schedule': 10, - 'args': () - } - }, - - timezone='UTC', - -) - -celery_app.conf.task_default_queue = "default" \ No newline at end of file diff --git a/src/controllers/BaseController.py b/src/controllers/BaseController.py deleted file mode 100644 index aa3e573d..00000000 --- a/src/controllers/BaseController.py +++ /dev/null @@ -1,35 +0,0 @@ -from helpers.config import get_settings, Settings -import os -import random -import string - -class BaseController: - - def __init__(self): - - self.app_settings = get_settings() - - self.base_dir = os.path.dirname( os.path.dirname(__file__) ) - self.files_dir = os.path.join( - self.base_dir, - "assets/files" - ) - - self.database_dir = os.path.join( - self.base_dir, - "assets/database" - ) - - def generate_random_string(self, length: int=12): - return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length)) - - def 
get_database_path(self, db_name: str): - - database_path = os.path.join( - self.database_dir, db_name - ) - - if not os.path.exists(database_path): - os.makedirs(database_path) - - return database_path \ No newline at end of file diff --git a/src/controllers/DataController.py b/src/controllers/DataController.py deleted file mode 100644 index d4bd5aab..00000000 --- a/src/controllers/DataController.py +++ /dev/null @@ -1,57 +0,0 @@ -from .BaseController import BaseController -from .ProjectController import ProjectController -from fastapi import UploadFile -from models import ResponseSignal -import re -import os - -class DataController(BaseController): - - def __init__(self): - super().__init__() - self.size_scale = 1048576 # convert MB to bytes - - def validate_uploaded_file(self, file: UploadFile): - - if file.content_type not in self.app_settings.FILE_ALLOWED_TYPES: - return False, ResponseSignal.FILE_TYPE_NOT_SUPPORTED.value - - if file.size > self.app_settings.FILE_MAX_SIZE * self.size_scale: - return False, ResponseSignal.FILE_SIZE_EXCEEDED.value - - return True, ResponseSignal.FILE_VALIDATED_SUCCESS.value - - def generate_unique_filepath(self, orig_file_name: str, project_id: str): - - random_key = self.generate_random_string() - project_path = ProjectController().get_project_path(project_id=project_id) - - cleaned_file_name = self.get_clean_file_name( - orig_file_name=orig_file_name - ) - - new_file_path = os.path.join( - project_path, - random_key + "_" + cleaned_file_name - ) - - while os.path.exists(new_file_path): - random_key = self.generate_random_string() - new_file_path = os.path.join( - project_path, - random_key + "_" + cleaned_file_name - ) - - return new_file_path, random_key + "_" + cleaned_file_name - - def get_clean_file_name(self, orig_file_name: str): - - # remove any special characters, except underscore and . 
- cleaned_file_name = re.sub(r'[^\w.]', '', orig_file_name.strip()) - - # replace spaces with underscore - cleaned_file_name = cleaned_file_name.replace(" ", "_") - - return cleaned_file_name - - diff --git a/src/controllers/NLPController.py b/src/controllers/NLPController.py deleted file mode 100644 index c2e49095..00000000 --- a/src/controllers/NLPController.py +++ /dev/null @@ -1,141 +0,0 @@ -from .BaseController import BaseController -from models.db_schemes import Project, DataChunk -from stores.llm.LLMEnums import DocumentTypeEnum -from typing import List -import json - -class NLPController(BaseController): - - def __init__(self, vectordb_client, generation_client, - embedding_client, template_parser): - super().__init__() - - self.vectordb_client = vectordb_client - self.generation_client = generation_client - self.embedding_client = embedding_client - self.template_parser = template_parser - - def create_collection_name(self, project_id: str): - return f"collection_{self.vectordb_client.default_vector_size}_{project_id}".strip() - - async def reset_vector_db_collection(self, project: Project): - collection_name = self.create_collection_name(project_id=project.project_id) - return await self.vectordb_client.delete_collection(collection_name=collection_name) - - async def get_vector_db_collection_info(self, project: Project): - collection_name = self.create_collection_name(project_id=project.project_id) - collection_info = await self.vectordb_client.get_collection_info(collection_name=collection_name) - - return json.loads( - json.dumps(collection_info, default=lambda x: x.__dict__) - ) - - async def index_into_vector_db(self, project: Project, chunks: List[DataChunk], - chunks_ids: List[int], - do_reset: bool = False): - - # step1: get collection name - collection_name = self.create_collection_name(project_id=project.project_id) - - # step2: manage items - texts = [ c.chunk_text for c in chunks ] - metadata = [ c.chunk_metadata for c in chunks] - vectors = 
self.embedding_client.embed_text(text=texts, - document_type=DocumentTypeEnum.DOCUMENT.value) - - # step3: create collection if not exists - _ = await self.vectordb_client.create_collection( - collection_name=collection_name, - embedding_size=self.embedding_client.embedding_size, - do_reset=do_reset, - ) - - # step4: insert into vector db - _ = await self.vectordb_client.insert_many( - collection_name=collection_name, - texts=texts, - metadata=metadata, - vectors=vectors, - record_ids=chunks_ids, - ) - - return True - - async def search_vector_db_collection(self, project: Project, text: str, limit: int = 10): - - # step1: get collection name - query_vector = None - collection_name = self.create_collection_name(project_id=project.project_id) - - # step2: get text embedding vector - vectors = self.embedding_client.embed_text(text=text, - document_type=DocumentTypeEnum.QUERY.value) - - if not vectors or len(vectors) == 0: - return False - - if isinstance(vectors, list) and len(vectors) > 0: - query_vector = vectors[0] - - if not query_vector: - return False - - # step3: do semantic search - results = await self.vectordb_client.search_by_vector( - collection_name=collection_name, - vector=query_vector, - limit=limit - ) - - if not results: - return False - - return results - - async def answer_rag_question(self, project: Project, query: str, limit: int = 10): - - answer, full_prompt, chat_history = None, None, None - - # step1: retrieve related documents - retrieved_documents = await self.search_vector_db_collection( - project=project, - text=query, - limit=limit, - ) - - if not retrieved_documents or len(retrieved_documents) == 0: - return answer, full_prompt, chat_history - - # step2: Construct LLM prompt - system_prompt = self.template_parser.get("rag", "system_prompt") - - documents_prompts = "\n".join([ - self.template_parser.get("rag", "document_prompt", { - "doc_num": idx + 1, - "chunk_text": self.generation_client.process_text(doc.text), - }) - for idx, doc in 
enumerate(retrieved_documents) - ]) - - footer_prompt = self.template_parser.get("rag", "footer_prompt", { - "query": query - }) - - # step3: Construct Generation Client Prompts - chat_history = [ - self.generation_client.construct_prompt( - prompt=system_prompt, - role=self.generation_client.enums.SYSTEM.value, - ) - ] - - full_prompt = "\n\n".join([ documents_prompts, footer_prompt]) - - # step4: Retrieve the Answer - answer = self.generation_client.generate_text( - prompt=full_prompt, - chat_history=chat_history - ) - - return answer, full_prompt, chat_history - diff --git a/src/controllers/ProcessController.py b/src/controllers/ProcessController.py deleted file mode 100644 index ecaa0877..00000000 --- a/src/controllers/ProcessController.py +++ /dev/null @@ -1,109 +0,0 @@ -from .BaseController import BaseController -from .ProjectController import ProjectController -import os -from langchain_community.document_loaders import TextLoader -from langchain_community.document_loaders import PyMuPDFLoader -from models import ProcessingEnum -from typing import List -from dataclasses import dataclass - -@dataclass -class Document: - page_content: str - metadata: dict - -class ProcessController(BaseController): - - def __init__(self, project_id: str): - super().__init__() - - self.project_id = project_id - self.project_path = ProjectController().get_project_path(project_id=project_id) - - def get_file_extension(self, file_id: str): - return os.path.splitext(file_id)[-1] - - def get_file_loader(self, file_id: str): - - file_ext = self.get_file_extension(file_id=file_id) - file_path = os.path.join( - self.project_path, - file_id - ) - - if not os.path.exists(file_path): - return None - - if file_ext == ProcessingEnum.TXT.value: - return TextLoader(file_path, encoding="utf-8") - - if file_ext == ProcessingEnum.PDF.value: - return PyMuPDFLoader(file_path) - - return None - - def get_file_content(self, file_id: str): - - loader = self.get_file_loader(file_id=file_id) - if 
loader: - return loader.load() - - return None - - def process_file_content(self, file_content: list, file_id: str, - chunk_size: int=100, overlap_size: int=20): - - file_content_texts = [ - rec.page_content - for rec in file_content - ] - - file_content_metadata = [ - rec.metadata - for rec in file_content - ] - - # chunks = text_splitter.create_documents( - # file_content_texts, - # metadatas=file_content_metadata - # ) - - chunks = self.process_simpler_splitter( - texts=file_content_texts, - metadatas=file_content_metadata, - chunk_size=chunk_size, - ) - - return chunks - - def process_simpler_splitter(self, texts: List[str], metadatas: List[dict], chunk_size: int, splitter_tag: str="\n"): - - full_text = " ".join(texts) - - # split by splitter_tag - lines = [ doc.strip() for doc in full_text.split(splitter_tag) if len(doc.strip()) > 1 ] - - chunks = [] - current_chunk = "" - - for line in lines: - current_chunk += line + splitter_tag - if len(current_chunk) >= chunk_size: - chunks.append(Document( - page_content=current_chunk.strip(), - metadata={} - )) - - current_chunk = "" - - if len(current_chunk) >= 0: - chunks.append(Document( - page_content=current_chunk.strip(), - metadata={} - )) - - return chunks - - - - diff --git a/src/controllers/ProjectController.py b/src/controllers/ProjectController.py deleted file mode 100644 index b3105a57..00000000 --- a/src/controllers/ProjectController.py +++ /dev/null @@ -1,22 +0,0 @@ -from .BaseController import BaseController -from fastapi import UploadFile -from models import ResponseSignal -import os - -class ProjectController(BaseController): - - def __init__(self): - super().__init__() - - def get_project_path(self, project_id: str): - project_dir = os.path.join( - self.files_dir, - str(project_id) - ) - - if not os.path.exists(project_dir): - os.makedirs(project_dir) - - return project_dir - - diff --git a/src/controllers/__init__.py b/src/controllers/__init__.py deleted file mode 100644 index 8876467a..00000000 --- 
a/src/controllers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .DataController import DataController -from .ProjectController import ProjectController -from .ProcessController import ProcessController -from .NLPController import NLPController - diff --git a/src/flowerconfig.py b/src/flowerconfig.py deleted file mode 100644 index 5bebec51..00000000 --- a/src/flowerconfig.py +++ /dev/null @@ -1,12 +0,0 @@ -from dotenv import dotenv_values -config = dotenv_values(".env") - -# Flower configuration -port = 5555 -max_tasks = 10000 -# db = 'flower.db' # SQLite database for persistent storage -auto_refresh = True - -# Authentication (optional) -basic_auth = [f'admin:{config["CELERY_FLOWER_PASSWORD"]}'] - diff --git a/src/helpers/__init__.py b/src/helpers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/helpers/config.py b/src/helpers/config.py deleted file mode 100644 index fb8600b8..00000000 --- a/src/helpers/config.py +++ /dev/null @@ -1,57 +0,0 @@ -from pydantic_settings import BaseSettings, SettingsConfigDict -from typing import List - -class Settings(BaseSettings): - - APP_NAME: str - APP_VERSION: str - OPENAI_API_KEY: str - - FILE_ALLOWED_TYPES: list - FILE_MAX_SIZE: int - FILE_DEFAULT_CHUNK_SIZE: int - - POSTGRES_USERNAME: str - POSTGRES_PASSWORD: str - POSTGRES_HOST: str - POSTGRES_PORT: int - POSTGRES_MAIN_DATABASE: str - - GENERATION_BACKEND: str - EMBEDDING_BACKEND: str - - OPENAI_API_KEY: str = None - OPENAI_API_URL: str = None - COHERE_API_KEY: str = None - - GENERATION_MODEL_ID_LITERAL: List[str] = None - GENERATION_MODEL_ID: str = None - EMBEDDING_MODEL_ID: str = None - EMBEDDING_MODEL_SIZE: int = None - INPUT_DAFAULT_MAX_CHARACTERS: int = None - GENERATION_DAFAULT_MAX_TOKENS: int = None - GENERATION_DAFAULT_TEMPERATURE: float = None - - VECTOR_DB_BACKEND_LITERAL: List[str] = None - VECTOR_DB_BACKEND : str - VECTOR_DB_PATH : str - VECTOR_DB_DISTANCE_METHOD: str = None - VECTOR_DB_PGVEC_INDEX_THRESHOLD: int = 100 - - 
PRIMARY_LANG: str = "en" - DEFAULT_LANG: str = "en" - - # Celery Configuration - CELERY_BROKER_URL: str = None - CELERY_RESULT_BACKEND: str = None - CELERY_TASK_SERIALIZER: str = "json" - CELERY_TASK_TIME_LIMIT: int = 600 - CELERY_TASK_ACKS_LATE: bool = True - CELERY_WORKER_CONCURRENCY: int = 2 - CELERY_FLOWER_PASSWORD: str = None - - class Config: - env_file = ".env" - -def get_settings(): - return Settings() diff --git a/src/main.py b/src/main.py deleted file mode 100644 index ffd2c2e3..00000000 --- a/src/main.py +++ /dev/null @@ -1,61 +0,0 @@ -from fastapi import FastAPI -from routes import base, data, nlp -from helpers.config import get_settings -from stores.llm.LLMProviderFactory import LLMProviderFactory -from stores.vectordb.VectorDBProviderFactory import VectorDBProviderFactory -from stores.llm.templates.template_parser import TemplateParser -from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession -from sqlalchemy.orm import sessionmaker - -# Import metrics setup -from utils.metrics import setup_metrics - -app = FastAPI() - -# Setup Prometheus metrics -setup_metrics(app) - -async def startup_span(): - settings = get_settings() - - postgres_conn = f"postgresql+asyncpg://{settings.POSTGRES_USERNAME}:{settings.POSTGRES_PASSWORD}@{settings.POSTGRES_HOST}:{settings.POSTGRES_PORT}/{settings.POSTGRES_MAIN_DATABASE}" - - app.db_engine = create_async_engine(postgres_conn) - app.db_client = sessionmaker( - app.db_engine, class_=AsyncSession, expire_on_commit=False - ) - - llm_provider_factory = LLMProviderFactory(settings) - vectordb_provider_factory = VectorDBProviderFactory(config=settings, db_client=app.db_client) - - # generation client - app.generation_client = llm_provider_factory.create(provider=settings.GENERATION_BACKEND) - app.generation_client.set_generation_model(model_id = settings.GENERATION_MODEL_ID) - - # embedding client - app.embedding_client = llm_provider_factory.create(provider=settings.EMBEDDING_BACKEND) - 
app.embedding_client.set_embedding_model(model_id=settings.EMBEDDING_MODEL_ID, - embedding_size=settings.EMBEDDING_MODEL_SIZE) - - # vector db client - app.vectordb_client = vectordb_provider_factory.create( - provider=settings.VECTOR_DB_BACKEND - ) - await app.vectordb_client.connect() - - app.template_parser = TemplateParser( - language=settings.PRIMARY_LANG, - default_language=settings.DEFAULT_LANG, - ) - - -async def shutdown_span(): - app.db_engine.dispose() - await app.vectordb_client.disconnect() - -app.on_event("startup")(startup_span) -app.on_event("shutdown")(shutdown_span) - -app.include_router(base.base_router) -app.include_router(data.data_router) -app.include_router(nlp.nlp_router) diff --git a/src/models/AssetModel.py b/src/models/AssetModel.py deleted file mode 100644 index 594e3a96..00000000 --- a/src/models/AssetModel.py +++ /dev/null @@ -1,50 +0,0 @@ -from .BaseDataModel import BaseDataModel -from .db_schemes import Asset -from .enums.DataBaseEnum import DataBaseEnum -from bson import ObjectId -from sqlalchemy.future import select - -class AssetModel(BaseDataModel): - - def __init__(self, db_client: object): - super().__init__(db_client=db_client) - self.db_client = db_client - - @classmethod - async def create_instance(cls, db_client: object): - instance = cls(db_client) - return instance - - async def create_asset(self, asset: Asset): - - async with self.db_client() as session: - async with session.begin(): - session.add(asset) - await session.commit() - await session.refresh(asset) - return asset - - async def get_all_project_assets(self, asset_project_id: str, asset_type: str): - - async with self.db_client() as session: - stmt = select(Asset).where( - Asset.asset_project_id == asset_project_id, - Asset.asset_type == asset_type - ) - result = await session.execute(stmt) - records = result.scalars().all() - return records - - async def get_asset_record(self, asset_project_id: str, asset_name: str): - - async with self.db_client() as session: - 
stmt = select(Asset).where( - Asset.asset_project_id == asset_project_id, - Asset.asset_name == asset_name - ) - result = await session.execute(stmt) - record = result.scalar_one_or_none() - return record - - - diff --git a/src/models/BaseDataModel.py b/src/models/BaseDataModel.py deleted file mode 100644 index ab9bace1..00000000 --- a/src/models/BaseDataModel.py +++ /dev/null @@ -1,7 +0,0 @@ -from helpers.config import get_settings, Settings - -class BaseDataModel: - - def __init__(self, db_client: object): - self.db_client = db_client - self.app_settings = get_settings() diff --git a/src/models/ChunkModel.py b/src/models/ChunkModel.py deleted file mode 100644 index 8768d5fc..00000000 --- a/src/models/ChunkModel.py +++ /dev/null @@ -1,69 +0,0 @@ -from .BaseDataModel import BaseDataModel -from .db_schemes import DataChunk -from .enums.DataBaseEnum import DataBaseEnum -from bson.objectid import ObjectId -from pymongo import InsertOne -from sqlalchemy.future import select -from sqlalchemy import func, delete - -class ChunkModel(BaseDataModel): - - def __init__(self, db_client: object): - super().__init__(db_client=db_client) - self.db_client = db_client - - @classmethod - async def create_instance(cls, db_client: object): - instance = cls(db_client) - return instance - - async def create_chunk(self, chunk: DataChunk): - - async with self.db_client() as session: - async with session.begin(): - session.add(chunk) - await session.commit() - await session.refresh(chunk) - return chunk - - async def get_chunk(self, chunk_id: str): - - async with self.db_client() as session: - result = await session.execute(select(DataChunk).where(DataChunk.chunk_id == chunk_id)) - chunk = result.scalar_one_or_none() - return chunk - - async def insert_many_chunks(self, chunks: list, batch_size: int=100): - - async with self.db_client() as session: - async with session.begin(): - for i in range(0, len(chunks), batch_size): - batch = chunks[i:i+batch_size] - session.add_all(batch) - await 
session.commit() - return len(chunks) - - async def delete_chunks_by_project_id(self, project_id: ObjectId): - async with self.db_client() as session: - stmt = delete(DataChunk).where(DataChunk.chunk_project_id == project_id) - result = await session.execute(stmt) - await session.commit() - return result.rowcount - - async def get_poject_chunks(self, project_id: ObjectId, page_no: int=1, page_size: int=50): - async with self.db_client() as session: - stmt = select(DataChunk).where(DataChunk.chunk_project_id == project_id).offset((page_no - 1) * page_size).limit(page_size) - result = await session.execute(stmt) - records = result.scalars().all() - return records - - async def get_total_chunks_count(self, project_id: ObjectId): - total_count = 0 - async with self.db_client() as session: - count_sql = select(func.count(DataChunk.chunk_id)).where(DataChunk.chunk_project_id == project_id) - records_count = await session.execute(count_sql) - total_count = records_count.scalar() - - return total_count - - diff --git a/src/models/ProjectModel.py b/src/models/ProjectModel.py deleted file mode 100644 index c3342af8..00000000 --- a/src/models/ProjectModel.py +++ /dev/null @@ -1,61 +0,0 @@ -from .BaseDataModel import BaseDataModel -from .db_schemes import Project -from .enums.DataBaseEnum import DataBaseEnum -from sqlalchemy.future import select -from sqlalchemy import func - -class ProjectModel(BaseDataModel): - - def __init__(self, db_client: object): - super().__init__(db_client=db_client) - self.db_client = db_client - - @classmethod - async def create_instance(cls, db_client: object): - instance = cls(db_client) - return instance - - async def create_project(self, project: Project): - async with self.db_client() as session: - async with session.begin(): - session.add(project) - await session.commit() - await session.refresh(project) - - return project - - async def get_project_or_create_one(self, project_id: str): - async with self.db_client() as session: - async with 
session.begin(): - query = select(Project).where(Project.project_id == project_id) - result = await session.execute(query) - project = result.scalar_one_or_none() - if project is None: - project_rec = Project( - project_id = project_id - ) - - project = await self.create_project(project=project_rec) - return project - else: - return project - - async def get_all_projects(self, page: int=1, page_size: int=10): - - async with self.db_client() as session: - async with session.begin(): - - total_documents = await session.execute(select( - func.count( Project.project_id ) - )) - - total_documents = total_documents.scalar_one() - - total_pages = total_documents // page_size - if total_documents % page_size > 0: - total_pages += 1 - - query = select(Project).offset((page - 1) * page_size ).limit(page_size) - projects = await session.execute(query).scalars().all() - - return projects, total_pages diff --git a/src/models/__init__.py b/src/models/__init__.py deleted file mode 100644 index 3b41f8a2..00000000 --- a/src/models/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .enums.ResponseEnums import ResponseSignal -from .enums.ProcessingEnum import ProcessingEnum - diff --git a/src/models/db_schemes/__init__.py b/src/models/db_schemes/__init__.py deleted file mode 100644 index e5301c78..00000000 --- a/src/models/db_schemes/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from models.db_schemes.minirag.schemes import Project, DataChunk, Asset, RetrievedDocument diff --git a/src/models/db_schemes/minirag/.gitignore b/src/models/db_schemes/minirag/.gitignore deleted file mode 100644 index a2462558..00000000 --- a/src/models/db_schemes/minirag/.gitignore +++ /dev/null @@ -1 +0,0 @@ -alembic.ini diff --git a/src/models/db_schemes/minirag/README.md b/src/models/db_schemes/minirag/README.md deleted file mode 100644 index a9437d7f..00000000 --- a/src/models/db_schemes/minirag/README.md +++ /dev/null @@ -1,21 +0,0 @@ -## Run Alembic Migrations - -### Configuration - -```bash -cp 
alembic.ini.example alembic.ini -``` - -- Update the `alembic.ini` with your database credentials (`sqlalchemy.url`) - -### (Optional) Create a new migration - -```bash -alembic revision --autogenerate -m "Add ..." -``` - -### Upgrade the database - -```bash -alembic upgrade head -``` diff --git a/src/models/db_schemes/minirag/__init__.py b/src/models/db_schemes/minirag/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/models/db_schemes/minirag/alembic.ini.example b/src/models/db_schemes/minirag/alembic.ini.example deleted file mode 100644 index 0e50bdb1..00000000 --- a/src/models/db_schemes/minirag/alembic.ini.example +++ /dev/null @@ -1,117 +0,0 @@ -# A generic, single database configuration. - -[alembic] -# path to migration scripts -# Use forward slashes (/) also on windows to provide an os agnostic path -script_location = alembic - -# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s -# Uncomment the line below if you want the files to be prepended with date and time -# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file -# for all available tokens -# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s - -# sys.path path, will be prepended to sys.path if present. -# defaults to the current working directory. -prepend_sys_path = . - -# timezone to use when rendering the date within the migration file -# as well as the filename. -# If specified, requires the python>=3.9 or backports.zoneinfo library. 
-# Any required deps can installed by adding `alembic[tz]` to the pip requirements -# string value is passed to ZoneInfo() -# leave blank for localtime -# timezone = - -# max length of characters to apply to the "slug" field -# truncate_slug_length = 40 - -# set to 'true' to run the environment during -# the 'revision' command, regardless of autogenerate -# revision_environment = false - -# set to 'true' to allow .pyc and .pyo files without -# a source .py file to be detected as revisions in the -# versions/ directory -# sourceless = false - -# version location specification; This defaults -# to alembic/versions. When using multiple version -# directories, initial revisions must be specified with --version-path. -# The path separator used here should be the separator specified by "version_path_separator" below. -# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions - -# version path separator; As mentioned above, this is the character used to split -# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. -# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. -# Valid values for version_path_separator are: -# -# version_path_separator = : -# version_path_separator = ; -# version_path_separator = space -# version_path_separator = newline -version_path_separator = os # Use os.pathsep. Default configuration used for new projects. - -# set to 'true' to search source files recursively -# in each "version_locations" directory -# new in Alembic version 1.10 -# recursive_version_locations = false - -# the output encoding used when revision files -# are written from script.py.mako -# output_encoding = utf-8 - -sqlalchemy.url = driver://user:pass@localhost/dbname - - -[post_write_hooks] -# post_write_hooks defines scripts or Python functions that are run -# on newly generated revision scripts. 
See the documentation for further -# detail and examples - -# format using "black" - use the console_scripts runner, against the "black" entrypoint -# hooks = black -# black.type = console_scripts -# black.entrypoint = black -# black.options = -l 79 REVISION_SCRIPT_FILENAME - -# lint with attempts to fix using "ruff" - use the exec runner, execute a binary -# hooks = ruff -# ruff.type = exec -# ruff.executable = %(here)s/.venv/bin/ruff -# ruff.options = --fix REVISION_SCRIPT_FILENAME - -# Logging configuration -[loggers] -keys = root,sqlalchemy,alembic - -[handlers] -keys = console - -[formatters] -keys = generic - -[logger_root] -level = WARNING -handlers = console -qualname = - -[logger_sqlalchemy] -level = WARNING -handlers = -qualname = sqlalchemy.engine - -[logger_alembic] -level = INFO -handlers = -qualname = alembic - -[handler_console] -class = StreamHandler -args = (sys.stderr,) -level = NOTSET -formatter = generic - -[formatter_generic] -format = %(levelname)-5.5s [%(name)s] %(message)s -datefmt = %H:%M:%S diff --git a/src/models/db_schemes/minirag/alembic/README b/src/models/db_schemes/minirag/alembic/README deleted file mode 100644 index 98e4f9c4..00000000 --- a/src/models/db_schemes/minirag/alembic/README +++ /dev/null @@ -1 +0,0 @@ -Generic single-database configuration. \ No newline at end of file diff --git a/src/models/db_schemes/minirag/alembic/env.py b/src/models/db_schemes/minirag/alembic/env.py deleted file mode 100644 index 7b1c3462..00000000 --- a/src/models/db_schemes/minirag/alembic/env.py +++ /dev/null @@ -1,79 +0,0 @@ -from logging.config import fileConfig - -from sqlalchemy import engine_from_config -from sqlalchemy import pool -from schemes import SQLAlchemyBase - -from alembic import context - -# this is the Alembic Config object, which provides -# access to the values within the .ini file in use. -config = context.config - -# Interpret the config file for Python logging. -# This line sets up loggers basically. 
-if config.config_file_name is not None: - fileConfig(config.config_file_name) - -# add your model's MetaData object here -# for 'autogenerate' support -# from myapp import mymodel -# target_metadata = mymodel.Base.metadata -target_metadata = SQLAlchemyBase.metadata - -# other values from the config, defined by the needs of env.py, -# can be acquired: -# my_important_option = config.get_main_option("my_important_option") -# ... etc. - - -def run_migrations_offline() -> None: - """Run migrations in 'offline' mode. - - This configures the context with just a URL - and not an Engine, though an Engine is acceptable - here as well. By skipping the Engine creation - we don't even need a DBAPI to be available. - - Calls to context.execute() here emit the given string to the - script output. - - """ - url = config.get_main_option("sqlalchemy.url") - context.configure( - url=url, - target_metadata=target_metadata, - literal_binds=True, - dialect_opts={"paramstyle": "named"}, - ) - - with context.begin_transaction(): - context.run_migrations() - - -def run_migrations_online() -> None: - """Run migrations in 'online' mode. - - In this scenario we need to create an Engine - and associate a connection with the context. 
- - """ - connectable = engine_from_config( - config.get_section(config.config_ini_section, {}), - prefix="sqlalchemy.", - poolclass=pool.NullPool, - ) - - with connectable.connect() as connection: - context.configure( - connection=connection, target_metadata=target_metadata - ) - - with context.begin_transaction(): - context.run_migrations() - - -if context.is_offline_mode(): - run_migrations_offline() -else: - run_migrations_online() diff --git a/src/models/db_schemes/minirag/alembic/script.py.mako b/src/models/db_schemes/minirag/alembic/script.py.mako deleted file mode 100644 index fbc4b07d..00000000 --- a/src/models/db_schemes/minirag/alembic/script.py.mako +++ /dev/null @@ -1,26 +0,0 @@ -"""${message} - -Revision ID: ${up_revision} -Revises: ${down_revision | comma,n} -Create Date: ${create_date} - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa -${imports if imports else ""} - -# revision identifiers, used by Alembic. -revision: str = ${repr(up_revision)} -down_revision: Union[str, None] = ${repr(down_revision)} -branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} -depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} - - -def upgrade() -> None: - ${upgrades if upgrades else "pass"} - - -def downgrade() -> None: - ${downgrades if downgrades else "pass"} diff --git a/src/models/db_schemes/minirag/alembic/versions/243ca8b683b0_update_celery_task_executions_table_.py b/src/models/db_schemes/minirag/alembic/versions/243ca8b683b0_update_celery_task_executions_table_.py deleted file mode 100644 index bc504d12..00000000 --- a/src/models/db_schemes/minirag/alembic/versions/243ca8b683b0_update_celery_task_executions_table_.py +++ /dev/null @@ -1,30 +0,0 @@ -"""update celery_task_executions table indexes - -Revision ID: 243ca8b683b0 -Revises: b9f9e870b09b -Create Date: 2025-08-03 23:15:43.860171 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# 
revision identifiers, used by Alembic. -revision: str = '243ca8b683b0' -down_revision: Union[str, None] = 'b9f9e870b09b' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.create_index('ixz_task_name_args_celery_hash', 'celery_task_executions', ['task_name', 'task_args_hash', 'celery_task_id'], unique=True) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.drop_index('ixz_task_name_args_celery_hash', table_name='celery_task_executions') - # ### end Alembic commands ### diff --git a/src/models/db_schemes/minirag/alembic/versions/b9f9e870b09b_create_celery_task_executions_table.py b/src/models/db_schemes/minirag/alembic/versions/b9f9e870b09b_create_celery_task_executions_table.py deleted file mode 100644 index ad406e13..00000000 --- a/src/models/db_schemes/minirag/alembic/versions/b9f9e870b09b_create_celery_task_executions_table.py +++ /dev/null @@ -1,51 +0,0 @@ -"""create celery_task_executions table - -Revision ID: b9f9e870b09b -Revises: fee4cd54bd38 -Create Date: 2025-08-03 22:17:07.184977 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql - -# revision identifiers, used by Alembic. -revision: str = 'b9f9e870b09b' -down_revision: Union[str, None] = 'fee4cd54bd38' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('celery_task_executions', - sa.Column('execution_id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('task_name', sa.String(length=255), nullable=False), - sa.Column('task_args_hash', sa.String(length=64), nullable=False), - sa.Column('celery_task_id', sa.UUID(), nullable=True), - sa.Column('status', sa.String(length=20), nullable=False), - sa.Column('task_args', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.Column('result', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.Column('started_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('completed_at', sa.DateTime(timezone=True), nullable=True), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True), - sa.PrimaryKeyConstraint('execution_id') - ) - op.create_index('ixz_celery_task_id', 'celery_task_executions', ['celery_task_id'], unique=False) - op.create_index('ixz_task_execution_created_at', 'celery_task_executions', ['created_at'], unique=False) - op.create_index('ixz_task_execution_status', 'celery_task_executions', ['status'], unique=False) - op.create_index('ixz_task_name_args_hash', 'celery_task_executions', ['task_name', 'task_args_hash'], unique=True) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index('ixz_task_name_args_hash', table_name='celery_task_executions') - op.drop_index('ixz_task_execution_status', table_name='celery_task_executions') - op.drop_index('ixz_task_execution_created_at', table_name='celery_task_executions') - op.drop_index('ixz_celery_task_id', table_name='celery_task_executions') - op.drop_table('celery_task_executions') - # ### end Alembic commands ### diff --git a/src/models/db_schemes/minirag/alembic/versions/fee4cd54bd38_initial_commit.py b/src/models/db_schemes/minirag/alembic/versions/fee4cd54bd38_initial_commit.py deleted file mode 100644 index e72bda01..00000000 --- a/src/models/db_schemes/minirag/alembic/versions/fee4cd54bd38_initial_commit.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Initial Commit - -Revision ID: fee4cd54bd38 -Revises: -Create Date: 2024-12-02 11:21:07.921865 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql - -# revision identifiers, used by Alembic. -revision: str = 'fee4cd54bd38' -down_revision: Union[str, None] = None -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('projects', - sa.Column('project_id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('project_uuid', sa.UUID(), nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True), - sa.PrimaryKeyConstraint('project_id'), - sa.UniqueConstraint('project_uuid') - ) - op.create_table('assets', - sa.Column('asset_id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('asset_uuid', sa.UUID(), nullable=False), - sa.Column('asset_type', sa.String(), nullable=False), - sa.Column('asset_name', sa.String(), nullable=False), - sa.Column('asset_size', sa.Integer(), nullable=False), - sa.Column('asset_config', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.Column('asset_project_id', sa.Integer(), nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True), - sa.ForeignKeyConstraint(['asset_project_id'], ['projects.project_id'], ), - sa.PrimaryKeyConstraint('asset_id'), - sa.UniqueConstraint('asset_uuid') - ) - op.create_index('ix_asset_project_id', 'assets', ['asset_project_id'], unique=False) - op.create_index('ix_asset_type', 'assets', ['asset_type'], unique=False) - op.create_table('chunks', - sa.Column('chunk_id', sa.Integer(), autoincrement=True, nullable=False), - sa.Column('chunk_uuid', sa.UUID(), nullable=False), - sa.Column('chunk_text', sa.String(), nullable=False), - sa.Column('chunk_metadata', postgresql.JSONB(astext_type=sa.Text()), nullable=True), - sa.Column('chunk_order', sa.Integer(), nullable=False), - sa.Column('chunk_project_id', sa.Integer(), nullable=False), - sa.Column('chunk_asset_id', sa.Integer(), nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), - 
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True), - sa.ForeignKeyConstraint(['chunk_asset_id'], ['assets.asset_id'], ), - sa.ForeignKeyConstraint(['chunk_project_id'], ['projects.project_id'], ), - sa.PrimaryKeyConstraint('chunk_id'), - sa.UniqueConstraint('chunk_uuid') - ) - op.create_index('ix_chunk_asset_id', 'chunks', ['chunk_asset_id'], unique=False) - op.create_index('ix_chunk_project_id', 'chunks', ['chunk_project_id'], unique=False) - # ### end Alembic commands ### - - -def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### - op.drop_index('ix_chunk_project_id', table_name='chunks') - op.drop_index('ix_chunk_asset_id', table_name='chunks') - op.drop_table('chunks') - op.drop_index('ix_asset_type', table_name='assets') - op.drop_index('ix_asset_project_id', table_name='assets') - op.drop_table('assets') - op.drop_table('projects') - # ### end Alembic commands ### diff --git a/src/models/db_schemes/minirag/schemes/__init__.py b/src/models/db_schemes/minirag/schemes/__init__.py deleted file mode 100644 index 478a864d..00000000 --- a/src/models/db_schemes/minirag/schemes/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .minirag_base import SQLAlchemyBase -from .asset import Asset -from .project import Project -from .datachunk import DataChunk, RetrievedDocument -from .celery_task_execution import CeleryTaskExecution diff --git a/src/models/db_schemes/minirag/schemes/asset.py b/src/models/db_schemes/minirag/schemes/asset.py deleted file mode 100644 index bce853bd..00000000 --- a/src/models/db_schemes/minirag/schemes/asset.py +++ /dev/null @@ -1,32 +0,0 @@ -from .minirag_base import SQLAlchemyBase -from sqlalchemy import Column, Integer, DateTime, func, String, ForeignKey -from sqlalchemy.dialects.postgresql import UUID, JSONB -from sqlalchemy.orm import relationship -from sqlalchemy import Index -import uuid - -class Asset(SQLAlchemyBase): - - __tablename__ = "assets" - - asset_id = Column(Integer, 
primary_key=True, autoincrement=True) - asset_uuid = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, nullable=False) - - asset_type = Column(String, nullable=False) - asset_name = Column(String, nullable=False) - asset_size = Column(Integer, nullable=False) - asset_config = Column(JSONB, nullable=True) - - asset_project_id = Column(Integer, ForeignKey("projects.project_id"), nullable=False) - - created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) - updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) - - project = relationship("Project", back_populates="assets") - chunks = relationship("DataChunk", back_populates="asset") - - __table_args__ = ( - Index('ix_asset_project_id', asset_project_id), - Index('ix_asset_type', asset_type), - ) - diff --git a/src/models/db_schemes/minirag/schemes/celery_task_execution.py b/src/models/db_schemes/minirag/schemes/celery_task_execution.py deleted file mode 100644 index 8ef63aa6..00000000 --- a/src/models/db_schemes/minirag/schemes/celery_task_execution.py +++ /dev/null @@ -1,34 +0,0 @@ -from .minirag_base import SQLAlchemyBase -from sqlalchemy import Column, Integer, DateTime, func, String, Text -from sqlalchemy.dialects.postgresql import UUID, JSONB -from sqlalchemy import Index -import uuid - -class CeleryTaskExecution(SQLAlchemyBase): - - __tablename__ = "celery_task_executions" - - execution_id = Column(Integer, primary_key=True, autoincrement=True) - - task_name = Column(String(255), nullable=False) - task_args_hash = Column(String(64), nullable=False) # SHA-256 hash of task arguments - celery_task_id = Column(UUID(as_uuid=True), nullable=True) - - status = Column(String(20), nullable=False, default='PENDING') - - task_args = Column(JSONB, nullable=True) - result = Column(JSONB, nullable=True) - - started_at = Column(DateTime(timezone=True), nullable=True) - completed_at = Column(DateTime(timezone=True), nullable=True) - created_at = 
Column(DateTime(timezone=True), server_default=func.now(), nullable=False) - updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) - - __table_args__ = ( - Index('ixz_task_name_args_celery_hash', task_name, task_args_hash, celery_task_id, unique=True), - Index('ixz_task_execution_status', status), - Index('ixz_task_execution_created_at', created_at), - Index('ixz_celery_task_id', celery_task_id), - ) - - diff --git a/src/models/db_schemes/minirag/schemes/datachunk.py b/src/models/db_schemes/minirag/schemes/datachunk.py deleted file mode 100644 index 536ca1b2..00000000 --- a/src/models/db_schemes/minirag/schemes/datachunk.py +++ /dev/null @@ -1,36 +0,0 @@ -from .minirag_base import SQLAlchemyBase -from sqlalchemy import Column, Integer, DateTime, func, String, ForeignKey -from sqlalchemy.dialects.postgresql import UUID, JSONB -from sqlalchemy.orm import relationship -from sqlalchemy import Index -from pydantic import BaseModel -import uuid - -class DataChunk(SQLAlchemyBase): - - __tablename__ = "chunks" - - chunk_id = Column(Integer, primary_key=True, autoincrement=True) - chunk_uuid = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, nullable=False) - - chunk_text = Column(String, nullable=False) - chunk_metadata = Column(JSONB, nullable=True) - chunk_order = Column(Integer, nullable=False) - - chunk_project_id = Column(Integer, ForeignKey("projects.project_id"), nullable=False) - chunk_asset_id = Column(Integer, ForeignKey("assets.asset_id"), nullable=False) - - created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) - updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) - - project = relationship("Project", back_populates="chunks") - asset = relationship("Asset", back_populates="chunks") - - __table_args__ = ( - Index('ix_chunk_project_id', chunk_project_id), - Index('ix_chunk_asset_id', chunk_asset_id), - ) - -class RetrievedDocument(BaseModel): - text: str - score: 
float diff --git a/src/models/db_schemes/minirag/schemes/minirag_base.py b/src/models/db_schemes/minirag/schemes/minirag_base.py deleted file mode 100644 index e40835fe..00000000 --- a/src/models/db_schemes/minirag/schemes/minirag_base.py +++ /dev/null @@ -1,2 +0,0 @@ -from sqlalchemy.ext.declarative import declarative_base -SQLAlchemyBase = declarative_base() diff --git a/src/models/db_schemes/minirag/schemes/project.py b/src/models/db_schemes/minirag/schemes/project.py deleted file mode 100644 index c41dcf00..00000000 --- a/src/models/db_schemes/minirag/schemes/project.py +++ /dev/null @@ -1,18 +0,0 @@ -from .minirag_base import SQLAlchemyBase -from sqlalchemy import Column, Integer, DateTime, func -from sqlalchemy.dialects.postgresql import UUID -import uuid -from sqlalchemy.orm import relationship - -class Project(SQLAlchemyBase): - - __tablename__ = "projects" - - project_id = Column(Integer, primary_key=True, autoincrement=True) - project_uuid = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, nullable=False) - - created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) - updated_at = Column(DateTime(timezone=True), onupdate=func.now(), nullable=True) - - chunks = relationship("DataChunk", back_populates="project") - assets = relationship("Asset", back_populates="project") diff --git a/src/models/enums/AssetTypeEnum.py b/src/models/enums/AssetTypeEnum.py deleted file mode 100644 index 0e849ae8..00000000 --- a/src/models/enums/AssetTypeEnum.py +++ /dev/null @@ -1,6 +0,0 @@ -from enum import Enum - -class AssetTypeEnum(Enum): - - FILE = "file" - \ No newline at end of file diff --git a/src/models/enums/DataBaseEnum.py b/src/models/enums/DataBaseEnum.py deleted file mode 100644 index bb11bf71..00000000 --- a/src/models/enums/DataBaseEnum.py +++ /dev/null @@ -1,8 +0,0 @@ -from enum import Enum - -class DataBaseEnum(Enum): - - COLLECTION_PROJECT_NAME = "projects" - COLLECTION_CHUNK_NAME = "chunks" - COLLECTION_ASSET_NAME = 
"assets" - diff --git a/src/models/enums/ProcessingEnum.py b/src/models/enums/ProcessingEnum.py deleted file mode 100644 index f6f4c3a7..00000000 --- a/src/models/enums/ProcessingEnum.py +++ /dev/null @@ -1,6 +0,0 @@ -from enum import Enum - -class ProcessingEnum(Enum): - - TXT = ".txt" - PDF = ".pdf" diff --git a/src/models/enums/ResponseEnums.py b/src/models/enums/ResponseEnums.py deleted file mode 100644 index aa46a092..00000000 --- a/src/models/enums/ResponseEnums.py +++ /dev/null @@ -1,24 +0,0 @@ -from enum import Enum - -class ResponseSignal(Enum): - - FILE_VALIDATED_SUCCESS = "file_validate_successfully" - FILE_TYPE_NOT_SUPPORTED = "file_type_not_supported" - FILE_SIZE_EXCEEDED = "file_size_exceeded" - FILE_UPLOAD_SUCCESS = "file_upload_success" - FILE_UPLOAD_FAILED = "file_upload_failed" - PROCESSING_SUCCESS = "processing_success" - PROCESSING_FAILED = "processing_failed" - NO_FILES_ERROR = "not_found_files" - FILE_ID_ERROR = "no_file_found_with_this_id" - PROJECT_NOT_FOUND_ERROR = "project_not_found" - INSERT_INTO_VECTORDB_ERROR = "insert_into_vectordb_error" - INSERT_INTO_VECTORDB_SUCCESS = "insert_into_vectordb_success" - VECTORDB_COLLECTION_RETRIEVED = "vectordb_collection_retrieved" - VECTORDB_SEARCH_ERROR = "vectordb_search_error" - VECTORDB_SEARCH_SUCCESS = "vectordb_search_success" - RAG_ANSWER_ERROR = "rag_answer_error" - RAG_ANSWER_SUCCESS = "rag_answer_success" - DATA_PUSH_TASK_READY="data_push_task_ready" - PROCESS_AND_PUSH_WORKFLOW_READY="process_and_push_workflow_ready" - \ No newline at end of file diff --git a/src/models/enums/__init__.py b/src/models/enums/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/requirements.txt b/src/requirements.txt deleted file mode 100644 index cb2b5a90..00000000 --- a/src/requirements.txt +++ /dev/null @@ -1,32 +0,0 @@ -fastapi==0.110.2 -uvicorn[standard]==0.29.0 -python-multipart==0.0.9 -python-dotenv==1.0.1 -pydantic-settings==2.2.1 -aiofiles==23.2.1 -langchain==0.1.20 
-PyMuPDF==1.24.3 -motor==3.4.0 -pydantic-mongo==2.3.0 -openai==1.75.0 -cohere==5.5.8 -qdrant-client==1.10.1 -SQLAlchemy==2.0.36 -asyncpg==0.30.0 -alembic==1.14.0 -psycopg2==2.9.10 -pgvector==0.4.0 -nltk==3.9.1 - -# Monitoring and metrics -prometheus-client==0.21.1 -starlette-exporter==0.23.0 -fastapi-health==0.4.0 - -# Task Queue and Background Processing -celery==5.5.3 -redis==6.2.0 -kombu==5.5.4 -billiard==4.2.1 -vine==5.1.0 -flower==2.0.1 \ No newline at end of file diff --git a/src/routes/__init__.py b/src/routes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/routes/base.py b/src/routes/base.py deleted file mode 100644 index 45d009e9..00000000 --- a/src/routes/base.py +++ /dev/null @@ -1,23 +0,0 @@ -from fastapi import FastAPI, APIRouter, Depends -import os -from helpers.config import get_settings, Settings -from time import sleep -import logging - -logger = logging.getLogger('uvicorn.error') - -base_router = APIRouter( - prefix="/api/v1", - tags=["api_v1"], -) - -@base_router.get("/") -async def welcome(app_settings: Settings = Depends(get_settings)): - - app_name = app_settings.APP_NAME - app_version = app_settings.APP_VERSION - - return { - "app_name": app_name, - "app_version": app_version, - } diff --git a/src/routes/data.py b/src/routes/data.py deleted file mode 100644 index b419c688..00000000 --- a/src/routes/data.py +++ /dev/null @@ -1,136 +0,0 @@ -from fastapi import FastAPI, APIRouter, Depends, UploadFile, status, Request -from fastapi.responses import JSONResponse -import os -from helpers.config import get_settings, Settings -from controllers import DataController, ProjectController, ProcessController -import aiofiles -from models import ResponseSignal -import logging -from .schemes.data import ProcessRequest -from models.ProjectModel import ProjectModel -from models.ChunkModel import ChunkModel -from models.AssetModel import AssetModel -from models.db_schemes import DataChunk, Asset -from models.enums.AssetTypeEnum 
import AssetTypeEnum -from controllers import NLPController -from tasks.file_processing import process_project_files -from tasks.process_workflow import process_and_push_workflow - -logger = logging.getLogger('uvicorn.error') - -data_router = APIRouter( - prefix="/api/v1/data", - tags=["api_v1", "data"], -) - -@data_router.post("/upload/{project_id}") -async def upload_data(request: Request, project_id: int, file: UploadFile, - app_settings: Settings = Depends(get_settings)): - - - project_model = await ProjectModel.create_instance( - db_client=request.app.db_client - ) - - project = await project_model.get_project_or_create_one( - project_id=project_id - ) - - # validate the file properties - data_controller = DataController() - - is_valid, result_signal = data_controller.validate_uploaded_file(file=file) - - if not is_valid: - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": result_signal - } - ) - - project_dir_path = ProjectController().get_project_path(project_id=project_id) - file_path, file_id = data_controller.generate_unique_filepath( - orig_file_name=file.filename, - project_id=project_id - ) - - try: - async with aiofiles.open(file_path, "wb") as f: - while chunk := await file.read(app_settings.FILE_DEFAULT_CHUNK_SIZE): - await f.write(chunk) - except Exception as e: - - logger.error(f"Error while uploading file: {e}") - - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": ResponseSignal.FILE_UPLOAD_FAILED.value - } - ) - - # store the assets into the database - asset_model = await AssetModel.create_instance( - db_client=request.app.db_client - ) - - asset_resource = Asset( - asset_project_id=project.project_id, - asset_type=AssetTypeEnum.FILE.value, - asset_name=file_id, - asset_size=os.path.getsize(file_path) - ) - - asset_record = await asset_model.create_asset(asset=asset_resource) - - return JSONResponse( - content={ - "signal": ResponseSignal.FILE_UPLOAD_SUCCESS.value, - 
"file_id": str(asset_record.asset_id), - } - ) - -@data_router.post("/process/{project_id}") -async def process_endpoint(request: Request, project_id: int, process_request: ProcessRequest): - - chunk_size = process_request.chunk_size - overlap_size = process_request.overlap_size - do_reset = process_request.do_reset - - task = process_project_files.delay( - project_id=project_id, - file_id=process_request.file_id, - chunk_size=chunk_size, - overlap_size=overlap_size, - do_reset=do_reset, - ) - - return JSONResponse( - content={ - "signal": ResponseSignal.PROCESSING_SUCCESS.value, - "task_id": task.id - } - ) - -@data_router.post("/process-and-push/{project_id}") -async def process_and_push_endpoint(request: Request, project_id: int, process_request: ProcessRequest): - - chunk_size = process_request.chunk_size - overlap_size = process_request.overlap_size - do_reset = process_request.do_reset - - workflow_task = process_and_push_workflow.delay( - project_id=project_id, - file_id=process_request.file_id, - chunk_size=chunk_size, - overlap_size=overlap_size, - do_reset=do_reset, - ) - - return JSONResponse( - content={ - "signal": ResponseSignal.PROCESS_AND_PUSH_WORKFLOW_READY.value, - "workflow_task_id": workflow_task.id - } - ) diff --git a/src/routes/nlp.py b/src/routes/nlp.py deleted file mode 100644 index ccd273e7..00000000 --- a/src/routes/nlp.py +++ /dev/null @@ -1,139 +0,0 @@ -from fastapi import FastAPI, APIRouter, status, Request -from fastapi.responses import JSONResponse -from routes.schemes.nlp import PushRequest, SearchRequest -from models.ProjectModel import ProjectModel -from models.ChunkModel import ChunkModel -from controllers import NLPController -from models import ResponseSignal -from tqdm.auto import tqdm -from tasks.data_indexing import index_data_content - -import logging - -logger = logging.getLogger('uvicorn.error') - -nlp_router = APIRouter( - prefix="/api/v1/nlp", - tags=["api_v1", "nlp"], -) - -@nlp_router.post("/index/push/{project_id}") 
-async def index_project(request: Request, project_id: int, push_request: PushRequest): - - task = index_data_content.delay( - project_id=project_id, - do_reset=push_request.do_reset - ) - - return JSONResponse( - content={ - "signal": ResponseSignal.DATA_PUSH_TASK_READY.value, - "task_id": task.id - } - ) - - -@nlp_router.get("/index/info/{project_id}") -async def get_project_index_info(request: Request, project_id: int): - - project_model = await ProjectModel.create_instance( - db_client=request.app.db_client - ) - - project = await project_model.get_project_or_create_one( - project_id=project_id - ) - - nlp_controller = NLPController( - vectordb_client=request.app.vectordb_client, - generation_client=request.app.generation_client, - embedding_client=request.app.embedding_client, - template_parser=request.app.template_parser, - ) - - collection_info = await nlp_controller.get_vector_db_collection_info(project=project) - - return JSONResponse( - content={ - "signal": ResponseSignal.VECTORDB_COLLECTION_RETRIEVED.value, - "collection_info": collection_info - } - ) - -@nlp_router.post("/index/search/{project_id}") -async def search_index(request: Request, project_id: int, search_request: SearchRequest): - - project_model = await ProjectModel.create_instance( - db_client=request.app.db_client - ) - - project = await project_model.get_project_or_create_one( - project_id=project_id - ) - - nlp_controller = NLPController( - vectordb_client=request.app.vectordb_client, - generation_client=request.app.generation_client, - embedding_client=request.app.embedding_client, - template_parser=request.app.template_parser, - ) - - results = await nlp_controller.search_vector_db_collection( - project=project, text=search_request.text, limit=search_request.limit - ) - - if not results: - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": ResponseSignal.VECTORDB_SEARCH_ERROR.value - } - ) - - return JSONResponse( - content={ - "signal": 
ResponseSignal.VECTORDB_SEARCH_SUCCESS.value, - "results": [ result.dict() for result in results ] - } - ) - -@nlp_router.post("/index/answer/{project_id}") -async def answer_rag(request: Request, project_id: int, search_request: SearchRequest): - - project_model = await ProjectModel.create_instance( - db_client=request.app.db_client - ) - - project = await project_model.get_project_or_create_one( - project_id=project_id - ) - - nlp_controller = NLPController( - vectordb_client=request.app.vectordb_client, - generation_client=request.app.generation_client, - embedding_client=request.app.embedding_client, - template_parser=request.app.template_parser, - ) - - answer, full_prompt, chat_history = await nlp_controller.answer_rag_question( - project=project, - query=search_request.text, - limit=search_request.limit, - ) - - if not answer: - return JSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={ - "signal": ResponseSignal.RAG_ANSWER_ERROR.value - } - ) - - return JSONResponse( - content={ - "signal": ResponseSignal.RAG_ANSWER_SUCCESS.value, - "answer": answer, - "full_prompt": full_prompt, - "chat_history": chat_history - } - ) diff --git a/src/routes/schemes/__init__.py b/src/routes/schemes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/routes/schemes/data.py b/src/routes/schemes/data.py deleted file mode 100644 index 2d72068b..00000000 --- a/src/routes/schemes/data.py +++ /dev/null @@ -1,8 +0,0 @@ -from pydantic import BaseModel -from typing import Optional - -class ProcessRequest(BaseModel): - file_id: str = None - chunk_size: Optional[int] = 100 - overlap_size: Optional[int] = 20 - do_reset: Optional[int] = 0 diff --git a/src/routes/schemes/nlp.py b/src/routes/schemes/nlp.py deleted file mode 100644 index 57319484..00000000 --- a/src/routes/schemes/nlp.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel -from typing import Optional - -class PushRequest(BaseModel): - do_reset: Optional[int] = 0 - -class 
SearchRequest(BaseModel): - text: str - limit: Optional[int] = 5 diff --git a/src/stores/llm/LLMEnums.py b/src/stores/llm/LLMEnums.py deleted file mode 100644 index bd7c8cfc..00000000 --- a/src/stores/llm/LLMEnums.py +++ /dev/null @@ -1,23 +0,0 @@ -from enum import Enum - -class LLMEnums(Enum): - OPENAI = "OPENAI" - COHERE = "COHERE" - -class OpenAIEnums(Enum): - SYSTEM = "system" - USER = "user" - ASSISTANT = "assistant" - -class CoHereEnums(Enum): - SYSTEM = "SYSTEM" - USER = "USER" - ASSISTANT = "CHATBOT" - - DOCUMENT = "search_document" - QUERY = "search_query" - - -class DocumentTypeEnum(Enum): - DOCUMENT = "document" - QUERY = "query" \ No newline at end of file diff --git a/src/stores/llm/LLMInterface.py b/src/stores/llm/LLMInterface.py deleted file mode 100644 index a86ebde5..00000000 --- a/src/stores/llm/LLMInterface.py +++ /dev/null @@ -1,24 +0,0 @@ -from abc import ABC, abstractmethod - -class LLMInterface(ABC): - - @abstractmethod - def set_generation_model(self, model_id: str): - pass - - @abstractmethod - def set_embedding_model(self, model_id: str, embedding_size: int): - pass - - @abstractmethod - def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: int=None, - temperature: float = None): - pass - - @abstractmethod - def embed_text(self, text: str, document_type: str = None): - pass - - @abstractmethod - def construct_prompt(self, prompt: str, role: str): - pass diff --git a/src/stores/llm/LLMProviderFactory.py b/src/stores/llm/LLMProviderFactory.py deleted file mode 100644 index 2cd25392..00000000 --- a/src/stores/llm/LLMProviderFactory.py +++ /dev/null @@ -1,27 +0,0 @@ - -from .LLMEnums import LLMEnums -from .providers import OpenAIProvider, CoHereProvider - -class LLMProviderFactory: - def __init__(self, config: dict): - self.config = config - - def create(self, provider: str): - if provider == LLMEnums.OPENAI.value: - return OpenAIProvider( - api_key = self.config.OPENAI_API_KEY, - api_url = self.config.OPENAI_API_URL, 
- default_input_max_characters=self.config.INPUT_DAFAULT_MAX_CHARACTERS, - default_generation_max_output_tokens=self.config.GENERATION_DAFAULT_MAX_TOKENS, - default_generation_temperature=self.config.GENERATION_DAFAULT_TEMPERATURE - ) - - if provider == LLMEnums.COHERE.value: - return CoHereProvider( - api_key = self.config.COHERE_API_KEY, - default_input_max_characters=self.config.INPUT_DAFAULT_MAX_CHARACTERS, - default_generation_max_output_tokens=self.config.GENERATION_DAFAULT_MAX_TOKENS, - default_generation_temperature=self.config.GENERATION_DAFAULT_TEMPERATURE - ) - - return None diff --git a/src/stores/llm/__init__.py b/src/stores/llm/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/stores/llm/providers/CoHereProvider.py b/src/stores/llm/providers/CoHereProvider.py deleted file mode 100644 index ac0cac4f..00000000 --- a/src/stores/llm/providers/CoHereProvider.py +++ /dev/null @@ -1,101 +0,0 @@ -from ..LLMInterface import LLMInterface -from ..LLMEnums import CoHereEnums, DocumentTypeEnum -import cohere -import logging -from typing import List, Union - -class CoHereProvider(LLMInterface): - - def __init__(self, api_key: str, - default_input_max_characters: int=1000, - default_generation_max_output_tokens: int=1000, - default_generation_temperature: float=0.1): - - self.api_key = api_key - - self.default_input_max_characters = default_input_max_characters - self.default_generation_max_output_tokens = default_generation_max_output_tokens - self.default_generation_temperature = default_generation_temperature - - self.generation_model_id = None - - self.embedding_model_id = None - self.embedding_size = None - - self.client = cohere.Client(api_key=self.api_key) - - self.enums = CoHereEnums - self.logger = logging.getLogger(__name__) - - def set_generation_model(self, model_id: str): - self.generation_model_id = model_id - - def set_embedding_model(self, model_id: str, embedding_size: int): - self.embedding_model_id = model_id - 
self.embedding_size = embedding_size - - def process_text(self, text: str): - return text[:self.default_input_max_characters].strip() - - def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: int=None, - temperature: float = None): - - if not self.client: - self.logger.error("CoHere client was not set") - return None - - if not self.generation_model_id: - self.logger.error("Generation model for CoHere was not set") - return None - - max_output_tokens = max_output_tokens if max_output_tokens else self.default_generation_max_output_tokens - temperature = temperature if temperature else self.default_generation_temperature - - response = self.client.chat( - model = self.generation_model_id, - chat_history = chat_history, - message = self.process_text(prompt), - temperature = temperature, - max_tokens = max_output_tokens - ) - - if not response or not response.text: - self.logger.error("Error while generating text with CoHere") - return None - - return response.text - - def embed_text(self, text: Union[str, List[str]], document_type: str = None): - if not self.client: - self.logger.error("CoHere client was not set") - return None - - if isinstance(text, str): - text = [text] - - if not self.embedding_model_id: - self.logger.error("Embedding model for CoHere was not set") - return None - - input_type = CoHereEnums.DOCUMENT - if document_type == DocumentTypeEnum.QUERY: - input_type = CoHereEnums.QUERY - - response = self.client.embed( - model = self.embedding_model_id, - texts = [ self.process_text(t) for t in text ], - input_type = input_type, - embedding_types=['float'], - ) - - if not response or not response.embeddings or not response.embeddings.float: - self.logger.error("Error while embedding text with CoHere") - return None - - return [ f for f in response.embeddings.float ] - - def construct_prompt(self, prompt: str, role: str): - return { - "role": role, - "text": prompt, - } \ No newline at end of file diff --git 
a/src/stores/llm/providers/OpenAIProvider.py b/src/stores/llm/providers/OpenAIProvider.py deleted file mode 100644 index c1a2b375..00000000 --- a/src/stores/llm/providers/OpenAIProvider.py +++ /dev/null @@ -1,109 +0,0 @@ -from ..LLMInterface import LLMInterface -from ..LLMEnums import OpenAIEnums -from openai import OpenAI -import logging -from typing import List, Union - -class OpenAIProvider(LLMInterface): - - def __init__(self, api_key: str, api_url: str=None, - default_input_max_characters: int=1000, - default_generation_max_output_tokens: int=1000, - default_generation_temperature: float=0.1): - - self.api_key = api_key - self.api_url = api_url - - self.default_input_max_characters = default_input_max_characters - self.default_generation_max_output_tokens = default_generation_max_output_tokens - self.default_generation_temperature = default_generation_temperature - - self.generation_model_id = None - - self.embedding_model_id = None - self.embedding_size = None - - self.client = OpenAI( - api_key = self.api_key, - base_url = self.api_url if self.api_url and len(self.api_url) else None - ) - - self.enums = OpenAIEnums - self.logger = logging.getLogger(__name__) - - def set_generation_model(self, model_id: str): - self.generation_model_id = model_id - - def set_embedding_model(self, model_id: str, embedding_size: int): - self.embedding_model_id = model_id - self.embedding_size = embedding_size - - def process_text(self, text: str): - return text[:self.default_input_max_characters].strip() - - def generate_text(self, prompt: str, chat_history: list=[], max_output_tokens: int=None, - temperature: float = None): - - if not self.client: - self.logger.error("OpenAI client was not set") - return None - - if not self.generation_model_id: - self.logger.error("Generation model for OpenAI was not set") - return None - - max_output_tokens = max_output_tokens if max_output_tokens else self.default_generation_max_output_tokens - temperature = temperature if temperature else 
self.default_generation_temperature - - chat_history.append( - self.construct_prompt(prompt=prompt, role=OpenAIEnums.USER.value) - ) - - response = self.client.chat.completions.create( - model = self.generation_model_id, - messages = chat_history, - max_tokens = max_output_tokens, - temperature = temperature - ) - - if not response or not response.choices or len(response.choices) == 0 or not response.choices[0].message: - self.logger.error("Error while generating text with OpenAI") - return None - - return response.choices[0].message.content - - - def embed_text(self, text: Union[str, List[str]], document_type: str = None): - - if not self.client: - self.logger.error("OpenAI client was not set") - return None - - if isinstance(text, str): - text = [text] - - if not self.embedding_model_id: - self.logger.error("Embedding model for OpenAI was not set") - return None - - response = self.client.embeddings.create( - model = self.embedding_model_id, - input = text, - ) - - if not response or not response.data or len(response.data) == 0 or not response.data[0].embedding: - self.logger.error("Error while embedding text with OpenAI") - return None - - return [ rec.embedding for rec in response.data ] - - def construct_prompt(self, prompt: str, role: str): - return { - "role": role, - "content": prompt, - } - - - - - diff --git a/src/stores/llm/providers/__init__.py b/src/stores/llm/providers/__init__.py deleted file mode 100644 index 3368d766..00000000 --- a/src/stores/llm/providers/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .CoHereProvider import CoHereProvider -from .OpenAIProvider import OpenAIProvider diff --git a/src/stores/llm/templates/__init__.py b/src/stores/llm/templates/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/stores/llm/templates/locales/__init__.py b/src/stores/llm/templates/locales/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/stores/llm/templates/locales/ar/__init__.py 
b/src/stores/llm/templates/locales/ar/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/stores/llm/templates/locales/ar/rag.py b/src/stores/llm/templates/locales/ar/rag.py deleted file mode 100644 index 80c18817..00000000 --- a/src/stores/llm/templates/locales/ar/rag.py +++ /dev/null @@ -1,33 +0,0 @@ -from string import Template - -#### RAG PROMPTS #### - -#### System #### - -system_prompt = Template("\n".join([ - "أنت مساعد لتوليد رد للمستخدم.", - "ستحصل على مجموعة من المستندات المرتبطة باستفسار المستخدم.", - "عليك توليد رد بناءً على المستندات المقدمة.", - "تجاهل المستندات التي لا تتعلق باستفسار المستخدم.", - "يمكنك الاعتذار للمستخدم إذا لم تتمكن من توليد رد.", - "عليك توليد الرد بنفس لغة استفسار المستخدم.", - "كن مؤدباً ومحترماً في التعامل مع المستخدم.", - "كن دقيقًا ومختصرًا في ردك. تجنب المعلومات غير الضرورية.", -])) - -#### Document #### -document_prompt = Template( - "\n".join([ - "## المستند رقم: $doc_num", - "### المحتوى: $chunk_text", - ]) -) - -#### Footer #### -footer_prompt = Template("\n".join([ - "بناءً فقط على المستندات المذكورة أعلاه، يرجى توليد إجابة للمستخدم.", - "## السؤال:", - "$query", - "", - "## الإجابة:", -])) \ No newline at end of file diff --git a/src/stores/llm/templates/locales/en/__init__.py b/src/stores/llm/templates/locales/en/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/stores/llm/templates/locales/en/rag.py b/src/stores/llm/templates/locales/en/rag.py deleted file mode 100644 index f784e349..00000000 --- a/src/stores/llm/templates/locales/en/rag.py +++ /dev/null @@ -1,33 +0,0 @@ -from string import Template - -#### RAG PROMPTS #### - -#### System #### - -system_prompt = Template("\n".join([ - "You are an assistant to generate a response for the user.", - "You will be provided by a set of docuemnts associated with the user's query.", - "You have to generate a response based on the documents provided.", - "Ignore the documents that are not relevant to the user's query.", 
- "You can applogize to the user if you are not able to generate a response.", - "You have to generate response in the same language as the user's query.", - "Be polite and respectful to the user.", - "Be precise and concise in your response. Avoid unnecessary information.", -])) - -#### Document #### -document_prompt = Template( - "\n".join([ - "## Document No: $doc_num", - "### Content: $chunk_text", - ]) -) - -#### Footer #### -footer_prompt = Template("\n".join([ - "Based only on the above documents, please generate an answer for the user.", - "## Question:", - "$query", - "", - "## Answer:", -])) \ No newline at end of file diff --git a/src/stores/llm/templates/template_parser.py b/src/stores/llm/templates/template_parser.py deleted file mode 100644 index 0cee58ee..00000000 --- a/src/stores/llm/templates/template_parser.py +++ /dev/null @@ -1,43 +0,0 @@ -import os - -class TemplateParser: - - def __init__(self, language: str=None, default_language='en'): - self.current_path = os.path.dirname(os.path.abspath(__file__)) - self.default_language = default_language - self.language = None - - self.set_language(language) - - - def set_language(self, language: str): - if not language: - self.language = self.default_language - - language_path = os.path.join(self.current_path, "locales", language) - if os.path.exists(language_path): - self.language = language - else: - self.language = self.default_language - - def get(self, group: str, key: str, vars: dict={}): - if not group or not key: - return None - - group_path = os.path.join(self.current_path, "locales", self.language, f"{group}.py" ) - targeted_language = self.language - if not os.path.exists(group_path): - group_path = os.path.join(self.current_path, "locales", self.default_language, f"{group}.py" ) - targeted_language = self.default_language - - if not os.path.exists(group_path): - return None - - # import group module - module = __import__(f"stores.llm.templates.locales.{targeted_language}.{group}", 
fromlist=[group]) - - if not module: - return None - - key_attribute = getattr(module, key) - return key_attribute.substitute(vars) diff --git a/src/stores/vectordb/VectorDBEnums.py b/src/stores/vectordb/VectorDBEnums.py deleted file mode 100644 index 783e4d99..00000000 --- a/src/stores/vectordb/VectorDBEnums.py +++ /dev/null @@ -1,25 +0,0 @@ -from enum import Enum - -class VectorDBEnums(Enum): - QDRANT = "QDRANT" - PGVECTOR = "PGVECTOR" - -class DistanceMethodEnums(Enum): - COSINE = "cosine" - DOT = "dot" - -class PgVectorTableSchemeEnums(Enum): - ID = 'id' - TEXT = 'text' - VECTOR = 'vector' - CHUNK_ID = 'chunk_id' - METADATA = 'metadata' - _PREFIX = 'pgvector' - -class PgVectorDistanceMethodEnums(Enum): - COSINE = "vector_cosine_ops" - DOT = "vector_l2_ops" - -class PgVectorIndexTypeEnums(Enum): - HNSW = "hnsw" - IVFFLAT = "ivfflat" diff --git a/src/stores/vectordb/VectorDBInterface.py b/src/stores/vectordb/VectorDBInterface.py deleted file mode 100644 index 19eb4d9d..00000000 --- a/src/stores/vectordb/VectorDBInterface.py +++ /dev/null @@ -1,52 +0,0 @@ -from abc import ABC, abstractmethod -from typing import List -from models.db_schemes import RetrievedDocument - -class VectorDBInterface(ABC): - - @abstractmethod - def connect(self): - pass - - @abstractmethod - def disconnect(self): - pass - - @abstractmethod - def is_collection_existed(self, collection_name: str) -> bool: - pass - - @abstractmethod - def list_all_collections(self) -> List: - pass - - @abstractmethod - def get_collection_info(self, collection_name: str) -> dict: - pass - - @abstractmethod - def delete_collection(self, collection_name: str): - pass - - @abstractmethod - def create_collection(self, collection_name: str, - embedding_size: int, - do_reset: bool = False): - pass - - @abstractmethod - def insert_one(self, collection_name: str, text: str, vector: list, - metadata: dict = None, - record_id: str = None): - pass - - @abstractmethod - def insert_many(self, collection_name: str, texts: 
list, - vectors: list, metadata: list = None, - record_ids: list = None, batch_size: int = 50): - pass - - @abstractmethod - def search_by_vector(self, collection_name: str, vector: list, limit: int) -> List[RetrievedDocument]: - pass - \ No newline at end of file diff --git a/src/stores/vectordb/VectorDBProviderFactory.py b/src/stores/vectordb/VectorDBProviderFactory.py deleted file mode 100644 index 0705dff7..00000000 --- a/src/stores/vectordb/VectorDBProviderFactory.py +++ /dev/null @@ -1,31 +0,0 @@ -from .providers import QdrantDBProvider, PGVectorProvider -from .VectorDBEnums import VectorDBEnums -from controllers.BaseController import BaseController -from sqlalchemy.orm import sessionmaker - -class VectorDBProviderFactory: - def __init__(self, config, db_client: sessionmaker=None): - self.config = config - self.base_controller = BaseController() - self.db_client = db_client - - def create(self, provider: str): - if provider == VectorDBEnums.QDRANT.value: - qdrant_db_client = self.base_controller.get_database_path(db_name=self.config.VECTOR_DB_PATH) - - return QdrantDBProvider( - db_client=qdrant_db_client, - distance_method=self.config.VECTOR_DB_DISTANCE_METHOD, - default_vector_size=self.config.EMBEDDING_MODEL_SIZE, - index_threshold=self.config.VECTOR_DB_PGVEC_INDEX_THRESHOLD, - ) - - if provider == VectorDBEnums.PGVECTOR.value: - return PGVectorProvider( - db_client=self.db_client, - distance_method=self.config.VECTOR_DB_DISTANCE_METHOD, - default_vector_size=self.config.EMBEDDING_MODEL_SIZE, - index_threshold=self.config.VECTOR_DB_PGVEC_INDEX_THRESHOLD, - ) - - return None diff --git a/src/stores/vectordb/__init__.py b/src/stores/vectordb/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/stores/vectordb/providers/PGVectorProvider.py b/src/stores/vectordb/providers/PGVectorProvider.py deleted file mode 100644 index 137a3046..00000000 --- a/src/stores/vectordb/providers/PGVectorProvider.py +++ /dev/null @@ -1,306 +0,0 @@ -from 
..VectorDBInterface import VectorDBInterface -from ..VectorDBEnums import (DistanceMethodEnums, PgVectorTableSchemeEnums, - PgVectorDistanceMethodEnums, PgVectorIndexTypeEnums) -import logging -from typing import List -from models.db_schemes import RetrievedDocument -from sqlalchemy.sql import text as sql_text -import json - -class PGVectorProvider(VectorDBInterface): - - def __init__(self, db_client, default_vector_size: int = 786, - distance_method: str = None, index_threshold: int=100): - - self.db_client = db_client - self.default_vector_size = default_vector_size - - self.index_threshold = index_threshold - - if distance_method == DistanceMethodEnums.COSINE.value: - distance_method = PgVectorDistanceMethodEnums.COSINE.value - elif distance_method == DistanceMethodEnums.DOT.value: - distance_method = PgVectorDistanceMethodEnums.DOT.value - - self.pgvector_table_prefix = PgVectorTableSchemeEnums._PREFIX.value - self.distance_method = distance_method - - self.logger = logging.getLogger("uvicorn") - self.default_index_name = lambda collection_name: f"{collection_name}_vector_idx" - - - async def connect(self): - async with self.db_client() as session: - try: - # Check if vector extension already exists - result = await session.execute(sql_text( - "SELECT 1 FROM pg_extension WHERE extname = 'vector'" - )) - extension_exists = result.scalar_one_or_none() - - if not extension_exists: - # Only create if it doesn't exist - await session.execute(sql_text("CREATE EXTENSION vector")) - await session.commit() - except Exception as e: - # If extension already exists or any other error, just log and continue - self.logger.warning(f"Vector extension setup: {str(e)}") - await session.rollback() - - async def disconnect(self): - pass - - async def is_collection_existed(self, collection_name: str) -> bool: - - record = None - async with self.db_client() as session: - async with session.begin(): - list_tbl = sql_text(f'SELECT * FROM pg_tables WHERE tablename = :collection_name') 
- results = await session.execute(list_tbl, {"collection_name": collection_name}) - record = results.scalar_one_or_none() - - return record - - async def list_all_collections(self) -> List: - records = [] - async with self.db_client() as session: - async with session.begin(): - list_tbl = sql_text('SELECT tablename FROM pg_tables WHERE tablename LIKE :prefix') - results = await session.execute(list_tbl, {"prefix": self.pgvector_table_prefix}) - records = results.scalars().all() - - return records - - async def get_collection_info(self, collection_name: str) -> dict: - async with self.db_client() as session: - async with session.begin(): - - table_info_sql = sql_text(f''' - SELECT schemaname, tablename, tableowner, tablespace, hasindexes - FROM pg_tables - WHERE tablename = :collection_name - ''') - - count_sql = sql_text(f'SELECT COUNT(*) FROM {collection_name}') - - table_info = await session.execute(table_info_sql, {"collection_name": collection_name}) - record_count = await session.execute(count_sql) - - table_data = table_info.fetchone() - if not table_data: - return None - - return { - "table_info": { - "schemaname": table_data[0], - "tablename": table_data[1], - "tableowner": table_data[2], - "tablespace": table_data[3], - "hasindexes": table_data[4], - }, - "record_count": record_count.scalar_one(), - } - - async def delete_collection(self, collection_name: str): - async with self.db_client() as session: - async with session.begin(): - self.logger.info(f"Deleting collection: {collection_name}") - - delete_sql = sql_text(f'DROP TABLE IF EXISTS {collection_name}') - await session.execute(delete_sql) - await session.commit() - - return True - - async def create_collection(self, collection_name: str, - embedding_size: int, - do_reset: bool = False): - - if do_reset: - _ = await self.delete_collection(collection_name=collection_name) - - is_collection_existed = await self.is_collection_existed(collection_name=collection_name) - if not is_collection_existed: - 
self.logger.info(f"Creating collection: {collection_name}") - async with self.db_client() as session: - async with session.begin(): - create_sql = sql_text( - f'CREATE TABLE {collection_name} (' - f'{PgVectorTableSchemeEnums.ID.value} bigserial PRIMARY KEY,' - f'{PgVectorTableSchemeEnums.TEXT.value} text, ' - f'{PgVectorTableSchemeEnums.VECTOR.value} vector({embedding_size}), ' - f'{PgVectorTableSchemeEnums.METADATA.value} jsonb DEFAULT \'{{}}\', ' - f'{PgVectorTableSchemeEnums.CHUNK_ID.value} integer, ' - f'FOREIGN KEY ({PgVectorTableSchemeEnums.CHUNK_ID.value}) REFERENCES chunks(chunk_id)' - ')' - ) - await session.execute(create_sql) - await session.commit() - - return True - - return False - - async def is_index_existed(self, collection_name: str) -> bool: - index_name = self.default_index_name(collection_name) - async with self.db_client() as session: - async with session.begin(): - check_sql = sql_text(f""" - SELECT 1 - FROM pg_indexes - WHERE tablename = :collection_name - AND indexname = :index_name - """) - results = await session.execute(check_sql, {"index_name": index_name, "collection_name": collection_name}) - - return bool(results.scalar_one_or_none()) - - async def create_vector_index(self, collection_name: str, - index_type: str = PgVectorIndexTypeEnums.HNSW.value): - is_index_existed = await self.is_index_existed(collection_name=collection_name) - if is_index_existed: - return False - - async with self.db_client() as session: - async with session.begin(): - count_sql = sql_text(f'SELECT COUNT(*) FROM {collection_name}') - result = await session.execute(count_sql) - records_count = result.scalar_one() - - if records_count < self.index_threshold: - return False - - self.logger.info(f"START: Creating vector index for collection: {collection_name}") - - index_name = self.default_index_name(collection_name) - create_idx_sql = sql_text( - f'CREATE INDEX {index_name} ON {collection_name} ' - f'USING {index_type} ({PgVectorTableSchemeEnums.VECTOR.value} 
{self.distance_method})' - ) - - await session.execute(create_idx_sql) - - self.logger.info(f"END: Created vector index for collection: {collection_name}") - - async def reset_vector_index(self, collection_name: str, - index_type: str = PgVectorIndexTypeEnums.HNSW.value) -> bool: - - index_name = self.default_index_name(collection_name) - async with self.db_client() as session: - async with session.begin(): - drop_sql = sql_text(f'DROP INDEX IF EXISTS {index_name}') - await session.execute(drop_sql) - - return await self.create_vector_index(collection_name=collection_name, index_type=index_type) - - - async def insert_one(self, collection_name: str, text: str, vector: list, - metadata: dict = None, - record_id: str = None): - - is_collection_existed = await self.is_collection_existed(collection_name=collection_name) - if not is_collection_existed: - self.logger.error(f"Can not insert new record to non-existed collection: {collection_name}") - return False - - if not record_id: - self.logger.error(f"Can not insert new record without chunk_id: {collection_name}") - return False - - async with self.db_client() as session: - async with session.begin(): - insert_sql = sql_text(f'INSERT INTO {collection_name} ' - f'({PgVectorTableSchemeEnums.TEXT.value}, {PgVectorTableSchemeEnums.VECTOR.value}, {PgVectorTableSchemeEnums.METADATA.value}, {PgVectorTableSchemeEnums.CHUNK_ID.value}) ' - 'VALUES (:text, :vector, :metadata, :chunk_id)' - ) - - metadata_json = json.dumps(metadata, ensure_ascii=False) if metadata is not None else "{}" - await session.execute(insert_sql, { - 'text': text, - 'vector': "[" + ",".join([ str(v) for v in vector ]) + "]", - 'metadata': metadata_json, - 'chunk_id': record_id - }) - await session.commit() - - await self.create_vector_index(collection_name=collection_name) - - return True - - - async def insert_many(self, collection_name: str, texts: list, - vectors: list, metadata: list = None, - record_ids: list = None, batch_size: int = 50): - - 
is_collection_existed = await self.is_collection_existed(collection_name=collection_name) - if not is_collection_existed: - self.logger.error(f"Can not insert new records to non-existed collection: {collection_name}") - return False - - if len(vectors) != len(record_ids): - self.logger.error(f"Invalid data items for collection: {collection_name}") - return False - - if not metadata or len(metadata) == 0: - metadata = [None] * len(texts) - - async with self.db_client() as session: - async with session.begin(): - for i in range(0, len(texts), batch_size): - batch_texts = texts[i:i+batch_size] - batch_vectors = vectors[i:i + batch_size] - batch_metadata = metadata[i:i + batch_size] - batch_record_ids = record_ids[i:i + batch_size] - - values = [] - - for _text, _vector, _metadata, _record_id in zip(batch_texts, batch_vectors, batch_metadata, batch_record_ids): - - metadata_json = json.dumps(_metadata, ensure_ascii=False) if _metadata is not None else "{}" - values.append({ - 'text': _text, - 'vector': "[" + ",".join([ str(v) for v in _vector ]) + "]", - 'metadata': metadata_json, - 'chunk_id': _record_id - }) - - batch_insert_sql = sql_text(f'INSERT INTO {collection_name} ' - f'({PgVectorTableSchemeEnums.TEXT.value}, ' - f'{PgVectorTableSchemeEnums.VECTOR.value}, ' - f'{PgVectorTableSchemeEnums.METADATA.value}, ' - f'{PgVectorTableSchemeEnums.CHUNK_ID.value}) ' - f'VALUES (:text, :vector, :metadata, :chunk_id)') - - await session.execute(batch_insert_sql, values) - - await self.create_vector_index(collection_name=collection_name) - - return True - - async def search_by_vector(self, collection_name: str, vector: list, limit: int): - - is_collection_existed = await self.is_collection_existed(collection_name=collection_name) - if not is_collection_existed: - self.logger.error(f"Can not search for records in a non-existed collection: {collection_name}") - return False - - vector = "[" + ",".join([ str(v) for v in vector ]) + "]" - async with self.db_client() as session: - 
async with session.begin(): - search_sql = sql_text(f'SELECT {PgVectorTableSchemeEnums.TEXT.value} as text, 1 - ({PgVectorTableSchemeEnums.VECTOR.value} <=> :vector) as score' - f' FROM {collection_name}' - ' ORDER BY score DESC ' - f'LIMIT {limit}' - ) - - result = await session.execute(search_sql, {"vector": vector}) - - records = result.fetchall() - - return [ - RetrievedDocument( - text=record.text, - score=record.score - ) - for record in records - ] diff --git a/src/stores/vectordb/providers/QdrantDBProvider.py b/src/stores/vectordb/providers/QdrantDBProvider.py deleted file mode 100644 index 57146481..00000000 --- a/src/stores/vectordb/providers/QdrantDBProvider.py +++ /dev/null @@ -1,152 +0,0 @@ -from qdrant_client import models, QdrantClient -from ..VectorDBInterface import VectorDBInterface -from ..VectorDBEnums import DistanceMethodEnums -import logging -from typing import List -from models.db_schemes import RetrievedDocument - -class QdrantDBProvider(VectorDBInterface): - - def __init__(self, db_client: str, default_vector_size: int = 786, - distance_method: str = None, index_threshold: int=100): - - self.client = None - self.db_client = db_client - self.distance_method = None - self.default_vector_size = default_vector_size - - if distance_method == DistanceMethodEnums.COSINE.value: - self.distance_method = models.Distance.COSINE - elif distance_method == DistanceMethodEnums.DOT.value: - self.distance_method = models.Distance.DOT - - self.logger = logging.getLogger('uvicorn') - - async def connect(self): - self.client = QdrantClient(path=self.db_client) - - async def disconnect(self): - self.client = None - - async def is_collection_existed(self, collection_name: str) -> bool: - return self.client.collection_exists(collection_name=collection_name) - - async def list_all_collections(self) -> List: - return self.client.get_collections() - - def get_collection_info(self, collection_name: str) -> dict: - return 
self.client.get_collection(collection_name=collection_name) - - async def delete_collection(self, collection_name: str): - if self.is_collection_existed(collection_name): - self.logger.info(f"Deleting collection: {collection_name}") - return self.client.delete_collection(collection_name=collection_name) - - async def create_collection(self, collection_name: str, - embedding_size: int, - do_reset: bool = False): - if do_reset: - _ = self.delete_collection(collection_name=collection_name) - - if not self.is_collection_existed(collection_name): - self.logger.info(f"Creating new Qdrant collection: {collection_name}") - - _ = self.client.create_collection( - collection_name=collection_name, - vectors_config=models.VectorParams( - size=embedding_size, - distance=self.distance_method - ) - ) - - return True - - return False - - async def insert_one(self, collection_name: str, text: str, vector: list, - metadata: dict = None, - record_id: str = None): - - if not self.is_collection_existed(collection_name): - self.logger.error(f"Can not insert new record to non-existed collection: {collection_name}") - return False - - try: - _ = self.client.upload_records( - collection_name=collection_name, - records=[ - models.Record( - id=[record_id], - vector=vector, - payload={ - "text": text, "metadata": metadata - } - ) - ] - ) - except Exception as e: - self.logger.error(f"Error while inserting batch: {e}") - return False - - return True - - async def insert_many(self, collection_name: str, texts: list, - vectors: list, metadata: list = None, - record_ids: list = None, batch_size: int = 50): - - if metadata is None: - metadata = [None] * len(texts) - - if record_ids is None: - record_ids = list(range(0, len(texts))) - - for i in range(0, len(texts), batch_size): - batch_end = i + batch_size - - batch_texts = texts[i:batch_end] - batch_vectors = vectors[i:batch_end] - batch_metadata = metadata[i:batch_end] - batch_record_ids = record_ids[i:batch_end] - - batch_records = [ - 
models.Record( - id=batch_record_ids[x], - vector=batch_vectors[x], - payload={ - "text": batch_texts[x], "metadata": batch_metadata[x] - } - ) - - for x in range(len(batch_texts)) - ] - - try: - _ = self.client.upload_records( - collection_name=collection_name, - records=batch_records, - ) - except Exception as e: - self.logger.error(f"Error while inserting batch: {e}") - return False - - return True - - async def search_by_vector(self, collection_name: str, vector: list, limit: int = 5): - - results = self.client.search( - collection_name=collection_name, - query_vector=vector, - limit=limit - ) - - if not results or len(results) == 0: - return None - - return [ - RetrievedDocument(**{ - "score": result.score, - "text": result.payload["text"], - }) - for result in results - ] - diff --git a/src/stores/vectordb/providers/__init__.py b/src/stores/vectordb/providers/__init__.py deleted file mode 100644 index 75bfb8e1..00000000 --- a/src/stores/vectordb/providers/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .QdrantDBProvider import QdrantDBProvider -from .PGVectorProvider import PGVectorProvider diff --git a/src/tasks/__init__.py b/src/tasks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/tasks/data_indexing.py b/src/tasks/data_indexing.py deleted file mode 100644 index a93b5f36..00000000 --- a/src/tasks/data_indexing.py +++ /dev/null @@ -1,144 +0,0 @@ -from celery_app import celery_app, get_setup_utils -from helpers.config import get_settings -import asyncio -from fastapi.responses import JSONResponse -from models.ProjectModel import ProjectModel -from models.ChunkModel import ChunkModel -from controllers import NLPController -from models import ResponseSignal -from tqdm.auto import tqdm - -import logging -logger = logging.getLogger(__name__) - -@celery_app.task( - bind=True, name="tasks.data_indexing.index_data_content", - autoretry_for=(Exception,), - retry_kwargs={'max_retries': 3, 'countdown': 60} - ) -def 
index_data_content(self, project_id: int, do_reset: int): - - logger.warning("index_data_content started") - return asyncio.run( - _index_data_content(self, project_id, do_reset) - ) - -async def _index_data_content(task_instance, project_id: int, do_reset: int): - - db_engine, vectordb_client = None, None - - try: - - (db_engine, db_client, llm_provider_factory, - vectordb_provider_factory, - generation_client, embedding_client, - vectordb_client, template_parser) = await get_setup_utils() - - logger.warning("Setup utils were loaded!") - - project_model = await ProjectModel.create_instance( - db_client=db_client - ) - - chunk_model = await ChunkModel.create_instance( - db_client=db_client - ) - - project = await project_model.get_project_or_create_one( - project_id=project_id - ) - - if not project: - - task_instance.update_state( - state="FAILURE", - meta={ - "signal": ResponseSignal.PROJECT_NOT_FOUND_ERROR.value - } - ) - - raise Exception(f"No project found for project_id: {project_id}") - - nlp_controller = NLPController( - vectordb_client=vectordb_client, - generation_client=generation_client, - embedding_client=embedding_client, - template_parser=template_parser, - ) - - has_records = True - page_no = 1 - inserted_items_count = 0 - idx = 0 - - # create collection if not exists - collection_name = nlp_controller.create_collection_name(project_id=project.project_id) - - _ = await vectordb_client.create_collection( - collection_name=collection_name, - embedding_size=embedding_client.embedding_size, - do_reset=do_reset, - ) - - # setup batching - total_chunks_count = await chunk_model.get_total_chunks_count(project_id=project.project_id) - pbar = tqdm(total=total_chunks_count, desc="Vector Indexing", position=0) - - while has_records: - page_chunks = await chunk_model.get_poject_chunks(project_id=project.project_id, page_no=page_no) - if len(page_chunks): - page_no += 1 - - if not page_chunks or len(page_chunks) == 0: - has_records = False - break - - chunks_ids 
= [ c.chunk_id for c in page_chunks ] - idx += len(page_chunks) - - is_inserted = await nlp_controller.index_into_vector_db( - project=project, - chunks=page_chunks, - chunks_ids=chunks_ids - ) - - if not is_inserted: - - - task_instance.update_state( - state="FAILURE", - meta={ - "signal": ResponseSignal.INSERT_INTO_VECTORDB_ERROR.value - } - ) - - raise Exception(f"can not insert into vectorDB | project_id: {project_id}") - - pbar.update(len(page_chunks)) - inserted_items_count += len(page_chunks) - - - task_instance.update_state( - state="SUCCESS", - meta={ - "signal": ResponseSignal.INSERT_INTO_VECTORDB_SUCCESS.value, - } - ) - - return { - "signal": ResponseSignal.INSERT_INTO_VECTORDB_SUCCESS.value, - "inserted_items_count": inserted_items_count - } - - except Exception as e: - logger.error(f"Task failed: {str(e)}") - raise - finally: - try: - if db_engine: - await db_engine.dispose() - - if vectordb_client: - await vectordb_client.disconnect() - except Exception as e: - logger.error(f"Task failed while cleaning: {str(e)}") \ No newline at end of file diff --git a/src/tasks/file_processing.py b/src/tasks/file_processing.py deleted file mode 100644 index 361ce98c..00000000 --- a/src/tasks/file_processing.py +++ /dev/null @@ -1,262 +0,0 @@ -from celery_app import celery_app, get_setup_utils -from helpers.config import get_settings -import asyncio -from models.ProjectModel import ProjectModel -from models.ChunkModel import ChunkModel -from models.AssetModel import AssetModel -from models.db_schemes import DataChunk -from models import ResponseSignal -from models.enums.AssetTypeEnum import AssetTypeEnum -from controllers import ProcessController -from controllers import NLPController -from utils.idempotency_manager import IdempotencyManager - -import logging -logger = logging.getLogger(__name__) - -@celery_app.task( - bind=True, name="tasks.file_processing.process_project_files", - autoretry_for=(Exception,), - retry_kwargs={'max_retries': 3, 'countdown': 60} - ) 
-def process_project_files(self, project_id: int, - file_id: int, chunk_size: int, - overlap_size: int, do_reset: int): - - return asyncio.run( - _process_project_files(self, project_id, file_id, chunk_size, - overlap_size, do_reset) - ) - - -async def _process_project_files(task_instance, project_id: int, - file_id: int, chunk_size: int, - overlap_size: int, do_reset: int): - - - db_engine, vectordb_client = None, None - - try: - - (db_engine, db_client, llm_provider_factory, - vectordb_provider_factory, - generation_client, embedding_client, - vectordb_client, template_parser) = await get_setup_utils() - - # Create idempotency manager - idempotency_manager = IdempotencyManager(db_client, db_engine) - - # Define task arguments for idempotency check - task_args = { - "project_id": project_id, - "file_id": file_id, - "chunk_size": chunk_size, - "overlap_size": overlap_size, - "do_reset": do_reset - } - - task_name = "tasks.file_processing.process_project_files" - - settings = get_settings() - - # Check if task should execute (600 seconds = 10 minutes timeout) - should_execute, existing_task = await idempotency_manager.should_execute_task( - task_name=task_name, - task_args=task_args, - celery_task_id=task_instance.request.id, - task_time_limit=settings.CELERY_TASK_TIME_LIMIT - ) - - if not should_execute: - logger.warning(f"Can not handle th task | status: {existing_task.status}") - return existing_task.result - - task_record = None - if existing_task: - # Update existing task with new celery task ID - await idempotency_manager.update_task_status( - execution_id=existing_task.execution_id, - status='PENDING' - ) - task_record = existing_task - else: - # Create new task record - task_record = await idempotency_manager.create_task_record( - task_name=task_name, - task_args=task_args, - celery_task_id=task_instance.request.id - ) - - # Update status to STARTED - await idempotency_manager.update_task_status( - execution_id=task_record.execution_id, - status='STARTED' - 
) - - - project_model = await ProjectModel.create_instance( - db_client=db_client - ) - - project = await project_model.get_project_or_create_one( - project_id=project_id - ) - - nlp_controller = NLPController( - vectordb_client=vectordb_client, - generation_client=generation_client, - embedding_client=embedding_client, - template_parser=template_parser, - ) - - asset_model = await AssetModel.create_instance( - db_client=db_client - ) - - project_files_ids = {} - if file_id: - asset_record = await asset_model.get_asset_record( - asset_project_id=project.project_id, - asset_name=file_id - ) - - if asset_record is None: - task_instance.update_state( - state="FAILURE", - meta={ - "signal": ResponseSignal.FILE_ID_ERROR.value, - } - ) - - # Update task status to FAILURE - await idempotency_manager.update_task_status( - execution_id=task_record.execution_id, - status='FAILURE', - result={"signal": ResponseSignal.FILE_ID_ERROR.value} - ) - - raise Exception(f"No assets for file: {file_id}") - - project_files_ids = { - asset_record.asset_id: asset_record.asset_name - } - - else: - - - project_files = await asset_model.get_all_project_assets( - asset_project_id=project.project_id, - asset_type=AssetTypeEnum.FILE.value, - ) - - project_files_ids = { - record.asset_id: record.asset_name - for record in project_files - } - - if len(project_files_ids) == 0: - - task_instance.update_state( - state="FAILURE", - meta={ - "signal": ResponseSignal.NO_FILES_ERROR.value, - } - ) - - # Update task status to FAILURE - await idempotency_manager.update_task_status( - execution_id=task_record.execution_id, - status='FAILURE', - result={"signal": ResponseSignal.NO_FILES_ERROR.value,} - ) - - raise Exception(f"No files found for project_id: {project.project_id}") - - process_controller = ProcessController(project_id=project_id) - - no_records = 0 - no_files = 0 - - chunk_model = await ChunkModel.create_instance( - db_client=db_client - ) - - if do_reset == 1: - # delete associated vectors 
collection - collection_name = nlp_controller.create_collection_name(project_id=project.project_id) - _ = await vectordb_client.delete_collection(collection_name=collection_name) - - # delete associated chunks - _ = await chunk_model.delete_chunks_by_project_id( - project_id=project.project_id - ) - - for asset_id, file_id in project_files_ids.items(): - - file_content = process_controller.get_file_content(file_id=file_id) - - if file_content is None: - logger.error(f"Error while processing file: {file_id}") - continue - - file_chunks = process_controller.process_file_content( - file_content=file_content, - file_id=file_id, - chunk_size=chunk_size, - overlap_size=overlap_size - ) - - if file_chunks is None or len(file_chunks) == 0: - - logger.error(f"No chunks for file_id: {file_id}") - pass - - file_chunks_records = [ - DataChunk( - chunk_text=chunk.page_content, - chunk_metadata=chunk.metadata, - chunk_order=i+1, - chunk_project_id=project.project_id, - chunk_asset_id=asset_id - ) - for i, chunk in enumerate(file_chunks) - ] - - no_records += await chunk_model.insert_many_chunks(chunks=file_chunks_records) - no_files += 1 - - task_instance.update_state( - state="SUCCESS", - meta={ - "signal": ResponseSignal.PROCESSING_SUCCESS.value, - } - ) - - await idempotency_manager.update_task_status( - execution_id=task_record.execution_id, - status='SUCCESS', - result={"signal": ResponseSignal.PROCESSING_SUCCESS.value} - ) - - logger.warning(f"inserted_chunks: {no_records}") - - return { - "signal": ResponseSignal.PROCESSING_SUCCESS.value, - "inserted_chunks": no_records, - "processed_files": no_files, - "project_id": project_id, - "do_reset": do_reset - } - - except Exception as e: - logger.error(f"Task failed: {str(e)}") - raise - finally: - try: - if db_engine: - await db_engine.dispose() - - if vectordb_client: - await vectordb_client.disconnect() - except Exception as e: - logger.error(f"Task failed while cleaning: {str(e)}") \ No newline at end of file diff --git 
a/src/tasks/maintenance.py b/src/tasks/maintenance.py deleted file mode 100644 index 6b2e94ea..00000000 --- a/src/tasks/maintenance.py +++ /dev/null @@ -1,50 +0,0 @@ -from celery_app import celery_app, get_setup_utils -from helpers.config import get_settings -import asyncio -from utils.idempotency_manager import IdempotencyManager - -import logging -logger = logging.getLogger(__name__) - -@celery_app.task( - bind=True, name="tasks.maintenance.clean_celery_executions_table", - autoretry_for=(Exception,), - retry_kwargs={'max_retries': 3, 'countdown': 60} - ) -def clean_celery_executions_table(self): - - return asyncio.run( - _clean_celery_executions_table(self) - ) - -async def _clean_celery_executions_table(task_instance): - - db_engine, vectordb_client = None, None - - try: - - (db_engine, db_client, llm_provider_factory, - vectordb_provider_factory, - generation_client, embedding_client, - vectordb_client, template_parser) = await get_setup_utils() - - # Create idempotency manager - idempotency_manager = IdempotencyManager(db_client, db_engine) - - logger.warning(f"cleaning !!!") - _ = await idempotency_manager.cleanup_old_tasks(5) - - return True - - except Exception as e: - logger.error(f"Task failed: {str(e)}") - raise - finally: - try: - if db_engine: - await db_engine.dispose() - - if vectordb_client: - await vectordb_client.disconnect() - except Exception as e: - logger.error(f"Task failed while cleaning: {str(e)}") \ No newline at end of file diff --git a/src/tasks/process_workflow.py b/src/tasks/process_workflow.py deleted file mode 100644 index aa77b61f..00000000 --- a/src/tasks/process_workflow.py +++ /dev/null @@ -1,54 +0,0 @@ -from celery import chain -from celery_app import celery_app, get_setup_utils -from helpers.config import get_settings -import asyncio -from tasks.file_processing import process_project_files -from tasks.data_indexing import _index_data_content - -import logging -logger = logging.getLogger(__name__) - -@celery_app.task( - 
bind=True, name="tasks.process_workflow.push_after_process_task", - autoretry_for=(Exception,), - retry_kwargs={'max_retries': 3, 'countdown': 60} - ) -def push_after_process_task(self, prev_task_result): - - project_id = prev_task_result.get("project_id") - do_reset = prev_task_result.get("do_reset") - - task_results = asyncio.run( - _index_data_content(self, project_id, do_reset) - ) - - return { - "project_id": project_id, - "do_reset": do_reset, - "task_results": task_results - } - - -@celery_app.task( - bind=True, name="tasks.process_workflow.process_and_push_workflow", - autoretry_for=(Exception,), - retry_kwargs={'max_retries': 3, 'countdown': 60} - ) -def process_and_push_workflow( self, project_id: int, - file_id: int, chunk_size: int, - overlap_size: int, do_reset: int): - - workflow = chain( - process_project_files.s(project_id, file_id, chunk_size, overlap_size, do_reset), - push_after_process_task.s() - ) - - result = workflow.apply_async() - - return { - "signal": "WORKFLOW_STARTED", - "workflow_id": result.id, - "tasks": ["tasks.file_processing.process_project_files", - "tasks.data_indexing.index_data_content"] - } - diff --git a/src/utils/__init__.py b/src/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/utils/idempotency_manager.py b/src/utils/idempotency_manager.py deleted file mode 100644 index 127a14de..00000000 --- a/src/utils/idempotency_manager.py +++ /dev/null @@ -1,124 +0,0 @@ -import hashlib -import json -from datetime import datetime, timedelta, timezone -from sqlalchemy import select, delete -from models.db_schemes.minirag.schemes.celery_task_execution import CeleryTaskExecution - -class IdempotencyManager: - - def __init__(self, db_client, db_engine): - self.db_client = db_client - self.db_engine = db_engine - - def create_args_hash(self, task_name: str, task_args: dict): - combined_data = { - **task_args, - "task_name": task_name - } - json_string = json.dumps(combined_data, sort_keys=True, 
default=str) - return hashlib.sha256(json_string.encode()).hexdigest() - - async def create_task_record(self, task_name: str, task_args: dict, celery_task_id: str = None) -> CeleryTaskExecution: - """Create new task execution record.""" - args_hash = self.create_args_hash(task_name, task_args) - - task_record = CeleryTaskExecution( - task_name=task_name, - task_args_hash=args_hash, - task_args=task_args, - celery_task_id=celery_task_id, - status='PENDING', - started_at=datetime.utcnow() - ) - - session = self.db_client() - try: - session.add(task_record) - await session.commit() - await session.refresh(task_record) - return task_record - finally: - await session.close() - - async def update_task_status(self, execution_id: int, status: str, result: dict = None): - """Update task status and result.""" - session = self.db_client() - try: - task_record = await session.get(CeleryTaskExecution, execution_id) - if task_record: - task_record.status = status - if result: - task_record.result = result - if status in ['SUCCESS', 'FAILURE']: - task_record.completed_at = datetime.utcnow() - await session.commit() - finally: - await session.close() - - async def get_existing_task(self, task_name: str, - task_args: dict, celery_task_id: str) -> CeleryTaskExecution: - """Check if task with same name and args already exists.""" - args_hash = self.create_args_hash(task_name, task_args) - - session = self.db_client() - try: - stmt = select(CeleryTaskExecution).where( - CeleryTaskExecution.celery_task_id == celery_task_id, - CeleryTaskExecution.task_name == task_name, - CeleryTaskExecution.task_args_hash == args_hash - ) - result = await session.execute(stmt) - return result.scalar_one_or_none() - finally: - await session.close() - - async def should_execute_task(self, task_name: str, task_args: dict, - celery_task_id: str, - task_time_limit: int = 600) -> tuple[bool, CeleryTaskExecution]: - """ - Check if task should be executed or return existing result. 
- Args: - task_time_limit: Time limit in seconds after which a stuck task can be re-executed - Returns (should_execute, existing_task_or_none) - """ - existing_task = await self.get_existing_task(task_name, task_args, celery_task_id) - - if not existing_task: - return True, None - - # Don't execute if task is already completed successfully - if existing_task.status == 'SUCCESS': - return False, existing_task - - # Check if task is stuck (running longer than time limit + 60 seconds) - if existing_task.status in ['PENDING', 'STARTED', 'RETRY']: - if existing_task.started_at: - time_elapsed = (datetime.utcnow() - existing_task.started_at).total_seconds() - time_gap = 60 # 60 seconds grace period - if time_elapsed > (task_time_limit + time_gap): - return True, existing_task # Task is stuck, allow re-execution - return False, existing_task # Task is still running within time limit - - # Re-execute if previous task failed - return True, existing_task - - async def cleanup_old_tasks(self, time_retention: int = 86400) -> int: - """ - Delete old task records older than time_retention seconds. 
- Args: - time_retention: Time in seconds to retain tasks (default: 86400 = 24 hours) - Returns: - Number of deleted records - """ - cutoff_time = datetime.now(timezone.utc) - timedelta(seconds=time_retention) - - session = self.db_client() - try: - stmt = delete(CeleryTaskExecution).where( - CeleryTaskExecution.created_at < cutoff_time - ) - result = await session.execute(stmt) - await session.commit() - return result.rowcount - finally: - await session.close() \ No newline at end of file diff --git a/src/utils/metrics.py b/src/utils/metrics.py deleted file mode 100644 index 1b189f3d..00000000 --- a/src/utils/metrics.py +++ /dev/null @@ -1,36 +0,0 @@ -from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST -from fastapi import FastAPI, Request, Response -from starlette.middleware.base import BaseHTTPMiddleware -import time - -# Define metrics -REQUEST_COUNT = Counter('http_requests_total', 'Total HTTP Requests', ['method', 'endpoint', 'status']) -REQUEST_LATENCY = Histogram('http_request_duration_seconds', 'HTTP Request Latency', ['method', 'endpoint']) - -class PrometheusMiddleware(BaseHTTPMiddleware): - async def dispatch(self, request: Request, call_next): - - start_time = time.time() - - # Process the request - response = await call_next(request) - - # Record metrics after request is processed - duration = time.time() - start_time - endpoint = request.url.path - - REQUEST_LATENCY.labels(method=request.method, endpoint=endpoint).observe(duration) - REQUEST_COUNT.labels(method=request.method, endpoint=endpoint, status=response.status_code).inc() - - return response - -def setup_metrics(app: FastAPI): - """ - Setup Prometheus metrics middleware and endpoint - """ - # Add Prometheus middleware - app.add_middleware(PrometheusMiddleware) - - @app.get("/TrhBVe_m5gg2002_E5VVqS", include_in_schema=False) - def metrics(): - return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) From 5998360a100bd84bba013b35ca4b5598bbfd0dd4 Mon 
Sep 17 00:00:00 2001 From: EssamShenhab Date: Sun, 25 Jan 2026 12:06:41 +0200 Subject: [PATCH 59/65] update .gitignore --- .gitignore | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 151 ------------------------------------- 2 files changed, 216 insertions(+), 151 deletions(-) create mode 100644 .gitignore delete mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..64d49ae3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,216 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. 
However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 100644 index d59c47f8..00000000 --- a/README.md +++ /dev/null @@ -1,151 +0,0 @@ -# mini-rag - -This is a minimal implementation of the RAG model for question answering. - -## The Course - -This is an educational project where all of the codes where explained (step by step) via a set of `Arabic` youtube videos. Please check the list: - -| # | Title | Link | Codes | -|---|------------------------------------------|------------------------------------------------------------------------------------------------------|----------------------------------------------------| -| 1 | About the Course ماذا ولمـــاذا | [Video](https://www.youtube.com/watch?v=Vv6e2Rb1Q6w&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj) | NA | -| 2 | What will we build ماذا سنبنى في المشروع | [Video](https://www.youtube.com/watch?v=_l5S5CdxE-Q&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=2) | NA | -| 3 | Setup your tools الأدوات الأساسية | [Video](https://www.youtube.com/watch?v=VSFbkFRAT4w&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=3) | NA | -| 4 | Project Architecture | [Video](https://www.youtube.com/watch?v=Ei_nBwBbFUQ&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=4) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-001) | -| 5 | Welcome to FastAPI | [Video](https://www.youtube.com/watch?v=cpOuCdzN_Mo&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=5) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-002) | -| 6 | Nested Routes + Env Values | [Video](https://www.youtube.com/watch?v=CrR2Bz2Y7Hw&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=6) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-003) | -| 7 | Uploading a File | 
[Video](https://www.youtube.com/watch?v=5alMKCbFqWs&list=PLvLvlVqNQGHCUR2p0b8a0QpVjDUg50wQj&index=7) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-004) | -| 8 | File Processing | [Video](https://www.youtube.com/watch?v=gQgr2iwtSBw) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-005) | -| 9 | Docker - MongoDB - Motor | [Video](https://www.youtube.com/watch?v=2NOKWm0xJAk) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-006) | -| 10 | Mongo Schemes and Models | [Video](https://www.youtube.com/watch?v=zgcnnMJXXV8) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-007) | -| 11 | Mongo Indexing | [Video](https://www.youtube.com/watch?v=iO8FAmUVcjE) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-008) | -| 12 | Data Pipeline Enhancements | [Video](https://www.youtube.com/watch?v=4x1DuezZBDU) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-008) | -| 13 | Checkpoint-1 | [Video](https://www.youtube.com/watch?v=7xIsZkCisPk) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-008) | -| 14 | LLM Factory | [Video](https://www.youtube.com/watch?v=5TKRIFtIQAY) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-008) | -| 15 | Vector DB Factory | [Video](https://www.youtube.com/watch?v=JtS9UkvF_10) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-009) | -| 16 | Semantic Search | [Video](https://www.youtube.com/watch?v=V3swQKokJW8) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-010) | -| 17 | Augmented Answers | [Video](https://www.youtube.com/watch?v=1Wx8BoM5pLU) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-011) | -| 18 | Checkpoint-1 + Fix Issues | [Video](https://youtu.be/6zG4Idxldvg) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | -| 19 | Ollama Local LLM Server | [Video](https://youtu.be/-epZ1hAAtrs) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-012) | -| 20 | From Mongo to Postgres + SQLAlchemy & Alembic | 
[Video](https://www.youtube.com/watch?v=BVOq7Ek2Up0) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-013) | -| 21 | The way to PgVector | [Video](https://www.youtube.com/watch?v=g99yq5zlYAE) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-014) | -| 22 | App Deployments 1/2 | [Video](https://www.youtube.com/watch?v=7QRPnAbVssg) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-015) | -| 22 | App Deployments 2/2 | [Video](https://www.youtube.com/watch?v=qJ5Hdyc4hDc) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-015) | -| 24 | Celery Workers 1/2 | [Video](https://www.youtube.com/watch?v=pX-iWWT2TJo) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-016) | -| 25 | Celery Workers 2/2 | [Video](https://www.youtube.com/watch?v=SZ5Aznjf8Kc) | [branch](https://github.com/bakrianoo/mini-rag/tree/tut-017) | - - - - -## Requirements - -- Python 3.10 - -#### Install Dependencies - -```bash -sudo apt update -sudo apt install libpq-dev gcc python3-dev -``` - -#### Install Python using MiniConda - -1) Download and install MiniConda from [here](https://docs.anaconda.com/free/miniconda/#quick-command-line-install) -2) Create a new environment using the following command: -```bash -$ conda create -n mini-rag python=3.10 -``` -3) Activate the environment: -```bash -$ conda activate mini-rag -``` - -### (Optional) Setup you command line interface for better readability - -```bash -export PS1="\[\033[01;32m\]\u@\h:\w\n\[\033[00m\]\$ " -``` - -### (Optional) Run Ollama Local LLM Server using Colab + Ngrok - -- Check the [notebook](https://colab.research.google.com/drive/1KNi3-9KtP-k-93T3wRcmRe37mRmGhL9p?usp=sharing) + [Video](https://youtu.be/-epZ1hAAtrs) - -## Installation - -### Install the required packages - -```bash -$ pip install -r requirements.txt -``` - -### Setup the environment variables - -```bash -$ cp .env.example .env -``` - -### Run Alembic Migration - -```bash -$ alembic upgrade head -``` - -Set your environment variables 
in the `.env` file. Like `OPENAI_API_KEY` value. - -## Run Docker Compose Services - -```bash -$ cd docker -$ cp .env.example .env -``` - -- update `.env` with your credentials - - - -```bash -$ cd docker -$ sudo docker compose up -d -``` - -## Access Services - -- **FastAPI**: http://localhost:8000 -- **Flower Dashboard**: http://localhost:5555 (admin/password from env) -- **Grafana**: http://localhost:3000 -- **Prometheus**: http://localhost:9090 - -## Run the FastAPI server (Development Mode) - -```bash -$ uvicorn main:app --reload --host 0.0.0.0 --port 5000 -``` - -# Celery (Development Mode) - -For development, you can run Celery services manually instead of using Docker: - -To Run the **Celery worker**, you need to run the following command in a separate terminal: - -```bash -$ python -m celery -A celery_app worker --queues=default,file_processing,data_indexing --loglevel=info -``` - -To run the **Beat scheduler**, you can run the following command in a separate terminal: - -```bash -$ python -m celery -A celery_app beat --loglevel=info -``` - -To Run **Flower Dashboard**, you can run the following command in a separate terminal: - -```bash -$ python -m celery -A celery_app flower --conf=flowerconfig.py -``` - - -open your browser and go to `http://localhost:5555` to see the dashboard. 
- -## POSTMAN Collection - -Download the POSTMAN collection from [/assets/mini-rag-app.postman_collection.json](/assets/mini-rag-app.postman_collection.json) From 9b0122e1990002fbfca370b2c10b868e3a9f5ffa Mon Sep 17 00:00:00 2001 From: EssamShenhab Date: Sun, 25 Jan 2026 12:59:49 +0200 Subject: [PATCH 60/65] update README and requirements --- README.md | 41 +++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 +++ 2 files changed, 44 insertions(+) create mode 100644 README.md create mode 100644 requirements.txt diff --git a/README.md b/README.md new file mode 100644 index 00000000..aaa0bb7a --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +# mini-rag + +This is a minimal implementation of the RAG model for question answering. + +## Requirements + +- Python 3.8 or later + +#### Install Python using MiniConda + +1) Download and install MiniConda from [here](https://docs.anaconda.com/free/miniconda/#quick-command-line-install) +2) Create a new environment using the following command: +```bash +$ conda create -n mini-rag python=3.8 +``` +3) Activate the environment: +```bash +$ conda activate mini-rag +``` + +### (Optional) Setup you command line interface for better readability + +```bash +export PS1="\[\033[01;32m\]\u@\h:\w\n\[\033[00m\]\$ " +``` + +## Installation + +### Install the required packages + +```bash +$ pip install -r requirements.txt +``` + +### Setup the environment variables + +```bash +$ cp .env.example .env +``` + +Set your environment variables in the `.env` file. Like `OPENAI_API_KEY` value. 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..b4a68ab4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +fastapi==0.110.2 +uvicorn[standard]==0.29.0 +python-multipart==0.0.9 \ No newline at end of file From 3c7240a325218ec7f4366af14ce6d9dedd4b111f Mon Sep 17 00:00:00 2001 From: EssamShenhab Date: Sun, 25 Jan 2026 13:13:24 +0200 Subject: [PATCH 61/65] add .env and .gitkeep --- .env.example | 2 ++ assetes/.gitkeep | 0 2 files changed, 2 insertions(+) create mode 100644 .env.example create mode 100644 assetes/.gitkeep diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..2d0f4846 --- /dev/null +++ b/.env.example @@ -0,0 +1,2 @@ +APP_NAME="mini-rag" +APP_VERSION="0.1.0" diff --git a/assetes/.gitkeep b/assetes/.gitkeep new file mode 100644 index 00000000..e69de29b From e2340a9e59301f27c897b23da526872371ad4cf3 Mon Sep 17 00:00:00 2001 From: EssamShenhab Date: Sun, 25 Jan 2026 13:51:10 +0200 Subject: [PATCH 62/65] update instructions --- {assetes => assets}/.gitkeep | 0 assets/mini-rag-app.postman_collection.json | 50 +++++++++++++++++++++ main.py | 7 +++ 3 files changed, 57 insertions(+) rename {assetes => assets}/.gitkeep (100%) create mode 100644 assets/mini-rag-app.postman_collection.json create mode 100644 main.py diff --git a/assetes/.gitkeep b/assets/.gitkeep similarity index 100% rename from assetes/.gitkeep rename to assets/.gitkeep diff --git a/assets/mini-rag-app.postman_collection.json b/assets/mini-rag-app.postman_collection.json new file mode 100644 index 00000000..b006f6bd --- /dev/null +++ b/assets/mini-rag-app.postman_collection.json @@ -0,0 +1,50 @@ +{ + "info": { + "_postman_id": "febe8027-4bc0-4f8e-ba7c-73b5eb97fa50", + "name": "mini-rag-app", + "description": "### Welcome to Postman! This is your first collection. \n\nCollections are your starting point for building and testing APIs. 
You can use this one to:\n\n• Group related requests\n• Test your API in real-world scenarios\n• Document and share your requests\n\nUpdate the name and overview whenever you’re ready to make it yours.\n\n[Learn more about Postman Collections.](https://learning.postman.com/docs/collections/collections-overview/)", + "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json", + "_exporter_id": "51802644", + "_collection_link": "https://go.postman.co/collection/51802644-febe8027-4bc0-4f8e-ba7c-73b5eb97fa50?source=collection_link" + }, + "item": [ + { + "name": "welcome-request", + "request": { + "method": "GET", + "header": [] + }, + "response": [] + } + ], + "event": [ + { + "listen": "prerequest", + "script": { + "type": "text/javascript", + "packages": {}, + "requests": {}, + "exec": [ + "" + ] + } + }, + { + "listen": "test", + "script": { + "type": "text/javascript", + "packages": {}, + "requests": {}, + "exec": [ + "" + ] + } + } + ], + "variable": [ + { + "key": "api", + "value": "" + } + ] +} \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 00000000..b14a2165 --- /dev/null +++ b/main.py @@ -0,0 +1,7 @@ +from fastapi import FastAPI + +app = FastAPI() + +@app.get("/welcome") +def welcome(): + return {"message": "Welcome to the FastAPI application!"} From 771027734d4b797aedf99f6a30bd9d9373db2509 Mon Sep 17 00:00:00 2001 From: EssamShenhab Date: Sun, 25 Jan 2026 14:29:37 +0200 Subject: [PATCH 63/65] add fastapi routing --- main.py | 9 ++++++--- requirements.txt | 3 ++- routes/__init__.py | 0 routes/base.py | 15 +++++++++++++++ 4 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 routes/__init__.py create mode 100644 routes/base.py diff --git a/main.py b/main.py index b14a2165..2ee7c944 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,10 @@ from fastapi import FastAPI +from dotenv import load_dotenv + +load_dotenv() + +from routes.base import base_router app = FastAPI() -@app.get("/welcome") 
-def welcome(): - return {"message": "Welcome to the FastAPI application!"} +app.include_router(base_router) diff --git a/requirements.txt b/requirements.txt index b4a68ab4..3bb443b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ fastapi==0.110.2 uvicorn[standard]==0.29.0 -python-multipart==0.0.9 \ No newline at end of file +python-multipart==0.0.9 +python-dotenv==1.0.1 diff --git a/routes/__init__.py b/routes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/routes/base.py b/routes/base.py new file mode 100644 index 00000000..fbfa4fec --- /dev/null +++ b/routes/base.py @@ -0,0 +1,15 @@ +from fastapi import FastAPI, APIRouter +import os + +base_router = APIRouter( + prefix="/api/v1", + tags=["api_v1"], +) + + +@base_router.get("/") +async def welcome(): + app_name = os.getenv("APP_NAME") + app_version = os.getenv("APP_VERSION") + + return {"app_name": app_name, "app_version": app_version} From c02bcded40cec55e251dfa28e411122c8b78166c Mon Sep 17 00:00:00 2001 From: EssamShenhab Date: Sun, 25 Jan 2026 14:42:20 +0200 Subject: [PATCH 64/65] created src folder --- __pycache__/main.cpython-310.pyc | Bin 0 -> 322 bytes .env.example => src/.env | 0 src/.env.example | 2 ++ src/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 156 bytes src/__pycache__/base.cpython-310.pyc | Bin 0 -> 507 bytes .gitignore => src/assets/.gitignore | 0 {assets => src/assets}/.gitkeep | 0 .../assets}/mini-rag-app.postman_collection.json | 0 main.py => src/main.py | 2 +- requirements.txt => src/requirements.txt | 0 {routes => src/routes}/__init__.py | 0 {routes => src/routes}/base.py | 0 12 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 __pycache__/main.cpython-310.pyc rename .env.example => src/.env (100%) create mode 100644 src/.env.example create mode 100644 src/__pycache__/__init__.cpython-310.pyc create mode 100644 src/__pycache__/base.cpython-310.pyc rename .gitignore => src/assets/.gitignore (100%) rename {assets => 
src/assets}/.gitkeep (100%) rename {assets => src/assets}/mini-rag-app.postman_collection.json (100%) rename main.py => src/main.py (75%) rename requirements.txt => src/requirements.txt (100%) rename {routes => src/routes}/__init__.py (100%) rename {routes => src/routes}/base.py (100%) diff --git a/__pycache__/main.cpython-310.pyc b/__pycache__/main.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e0ad65cf08fade6962bf3219503b305d919beb5 GIT binary patch literal 322 zcmYjLO-lnY5S=9ZQQU3+#~#{TJW5dn!HWpqaw)NiZ6Mh!$!Z02&2 zqe6}#3SGRHIzJ_M_5@A-g&EM0vAmP9zj?gJv9eyOjq<&24~ag@s5ik6y$%n}Oq4I^ zl${9_FUR8S3+Yp$np|;}E<%>pOm>}En09Li^&9d5p~T$;-G-W5Le!Bv o)0$<{9 literal 0 HcmV?d00001 diff --git a/.env.example b/src/.env similarity index 100% rename from .env.example rename to src/.env diff --git a/src/.env.example b/src/.env.example new file mode 100644 index 00000000..2d0f4846 --- /dev/null +++ b/src/.env.example @@ -0,0 +1,2 @@ +APP_NAME="mini-rag" +APP_VERSION="0.1.0" diff --git a/src/__pycache__/__init__.cpython-310.pyc b/src/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..823105c6b58e1641f0f937eaab728723eead0fae GIT binary patch literal 156 zcmd1j<>g`k0tJq;Oc4DTL?8o3AjbiSi&=m~3PUi1CZpdoGd6Em@nTKW+pR}tl6wHvg6e<*#gGCoLDxW#2&fXA&^YMn57^fMSTil%DiQAXnZ{_{yKH$Y2gKl+i1D0TD<%XRX>dRbVZ&;bClwZvV*E2+p!;p>*#) z%mBsUub&rq-#NSMPTs~8f42lfEc zdw1{qJrQ;L{XwsLav&OxT^yX9AD#9-&_;2d4<>P{M8mOCX_I6V>H#xjgOMuKq|_n3 z^1*b2j>R^=&QisdF>%WGv#B=9@HCkuZ5@x=!|LK+bM1yS{4zFbGrzOpjT#Rr-EH1P z1o%|PI@D1&Nwh*6UeShV586$s-I%&YU>mN&psJ>=M{QLJRYyy(>O!NPG?UY@+NDor SW}RT2MSzIlL+}#?Yw!zzqj`(~ literal 0 HcmV?d00001 diff --git a/.gitignore b/src/assets/.gitignore similarity index 100% rename from .gitignore rename to src/assets/.gitignore diff --git a/assets/.gitkeep b/src/assets/.gitkeep similarity index 100% rename from assets/.gitkeep rename to src/assets/.gitkeep diff --git a/assets/mini-rag-app.postman_collection.json 
b/src/assets/mini-rag-app.postman_collection.json similarity index 100% rename from assets/mini-rag-app.postman_collection.json rename to src/assets/mini-rag-app.postman_collection.json diff --git a/main.py b/src/main.py similarity index 75% rename from main.py rename to src/main.py index 2ee7c944..7744c27f 100644 --- a/main.py +++ b/src/main.py @@ -3,7 +3,7 @@ load_dotenv() -from routes.base import base_router +from src.routes.base import base_router app = FastAPI() diff --git a/requirements.txt b/src/requirements.txt similarity index 100% rename from requirements.txt rename to src/requirements.txt diff --git a/routes/__init__.py b/src/routes/__init__.py similarity index 100% rename from routes/__init__.py rename to src/routes/__init__.py diff --git a/routes/base.py b/src/routes/base.py similarity index 100% rename from routes/base.py rename to src/routes/base.py From 69f43cbe1e2493d3804ac106c435de1541500263 Mon Sep 17 00:00:00 2001 From: EssamShenhab Date: Wed, 28 Jan 2026 21:13:00 +0200 Subject: [PATCH 65/65] Completed tut-004 --- src/.env | 5 + src/.env.example | 5 + src/.gitignore | 216 ++++++++++++++++++++++++++ src/assets/.gitignore | 217 +-------------------------- src/controllers/BaseController.py | 15 ++ src/controllers/DataController.py | 40 +++++ src/controllers/ProjectController.py | 17 +++ src/controllers/__init__.py | 2 + src/helpers/__init__.py | 0 src/helpers/config.py | 17 +++ src/main.py | 8 +- src/models/__init__.py | 1 + src/models/enums/ResponseEnums.py | 9 ++ src/models/enums/__init__.py | 0 src/requirements.txt | 2 + src/routes/base.py | 12 +- src/routes/data.py | 57 +++++++ 17 files changed, 398 insertions(+), 225 deletions(-) create mode 100644 src/.gitignore create mode 100644 src/controllers/BaseController.py create mode 100644 src/controllers/DataController.py create mode 100644 src/controllers/ProjectController.py create mode 100644 src/controllers/__init__.py create mode 100644 src/helpers/__init__.py create mode 100644 
src/helpers/config.py create mode 100644 src/models/__init__.py create mode 100644 src/models/enums/ResponseEnums.py create mode 100644 src/models/enums/__init__.py create mode 100644 src/routes/data.py diff --git a/src/.env b/src/.env index 2d0f4846..ad9f0aa8 100644 --- a/src/.env +++ b/src/.env @@ -1,2 +1,7 @@ APP_NAME="mini-rag" APP_VERSION="0.1.0" +OPENAI_API_KEY="********" + +FILE_ALLOWED_TYPES=["text/plain", "application/pdf"] +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512 KB diff --git a/src/.env.example b/src/.env.example index 2d0f4846..a939daa6 100644 --- a/src/.env.example +++ b/src/.env.example @@ -1,2 +1,7 @@ APP_NAME="mini-rag" APP_VERSION="0.1.0" +OPENAI_API_KEY="" + +FILE_ALLOWED_TYPES= +FILE_MAX_SIZE=10 +FILE_DEFAULT_CHUNK_SIZE=512000 # 512 KB diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 00000000..64d49ae3 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,216 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. 
+# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml \ No newline at end of file diff --git a/src/assets/.gitignore b/src/assets/.gitignore index 64d49ae3..45afc370 100644 --- a/src/assets/.gitignore +++ b/src/assets/.gitignore @@ -1,216 +1 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[codz] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py.cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -# Pipfile.lock - -# UV -# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# uv.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -# poetry.lock -# poetry.toml - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 
-# https://pdm-project.org/en/latest/usage/project/#working-with-version-control -# pdm.lock -# pdm.toml -.pdm-python -.pdm-build/ - -# pixi -# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. -# pixi.lock -# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one -# in the .venv directory. It is recommended not to include this directory in version control. -.pixi - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# Redis -*.rdb -*.aof -*.pid - -# RabbitMQ -mnesia/ -rabbitmq/ -rabbitmq-data/ - -# ActiveMQ -activemq-data/ - -# SageMath parsed files -*.sage.py - -# Environments -.env -.envrc -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -# .idea/ - -# Abstra -# Abstra is an AI-powered process automation framework. -# Ignore directories containing user credentials, local state, and settings. 
-# Learn more at https://abstra.io/docs -.abstra/ - -# Visual Studio Code -# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore -# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore -# and can be added to the global gitignore or merged into this file. However, if you prefer, -# you could uncomment the following to ignore the entire vscode folder -# .vscode/ - -# Ruff stuff: -.ruff_cache/ - -# PyPI configuration file -.pypirc - -# Marimo -marimo/_static/ -marimo/_lsp/ -__marimo__/ - -# Streamlit -.streamlit/secrets.toml \ No newline at end of file +files \ No newline at end of file diff --git a/src/controllers/BaseController.py b/src/controllers/BaseController.py new file mode 100644 index 00000000..e7901716 --- /dev/null +++ b/src/controllers/BaseController.py @@ -0,0 +1,15 @@ +from helpers.config import get_settings, Settings +import os +import random +import string + + +class BaseController: + def __init__(self): + self.app_settings = get_settings() + self.base_dir = os.path.dirname(os.path.dirname(__file__)) + self.files_dir = os.path.join(self.base_dir, "assets/files") + + def generate_random_string(self, length: int = 12): + return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length)) + \ No newline at end of file diff --git a/src/controllers/DataController.py b/src/controllers/DataController.py new file mode 100644 index 00000000..802e0286 --- /dev/null +++ b/src/controllers/DataController.py @@ -0,0 +1,40 @@ +from .BaseController import BaseController +from .ProjectController import ProjectController +from fastapi import UploadFile +from models import ResponseSignal +import re +import os + +class DataController(BaseController): + + def __init__(self): + super().__init__() + self.size_scale = 1048576 + + def validate_file(self, file: UploadFile): + if file.content_type not in self.app_settings.FILE_ALLOWED_TYPES: + return False, 
ResponseSignal.FILE_TYPE_NOT_SUPPORTED.value + if file.size > self.app_settings.FILE_MAX_SIZE * self.size_scale: + return False, ResponseSignal.FILE_SIZE_EXCEEDED.value + return True, ResponseSignal.FILE_VALIDATED_SUCCESS.value + + def generate_unique_filename(self, orig_file_name: str, project_id: str): + + random_key = self.generate_random_string() + project_path = ProjectController().get_project_path(project_id=project_id) + clean_file_name = self.get_clean_filename(orig_file_name=orig_file_name) + new_file_path = os.path.join(project_path, random_key + "_" + clean_file_name) + + while os.path.exists(new_file_path): + random_key = self.generate_random_string() + new_file_path = os.path.join(project_path, random_key + "_" + clean_file_name) + + return new_file_path + + + + def get_clean_filename(self, orig_file_name: str): + clean_file_name = re.sub(r'[^\w.]', '', orig_file_name.strip()) + clean_file_name = clean_file_name.replace(" ", "_") + return clean_file_name + \ No newline at end of file diff --git a/src/controllers/ProjectController.py b/src/controllers/ProjectController.py new file mode 100644 index 00000000..4dbcd755 --- /dev/null +++ b/src/controllers/ProjectController.py @@ -0,0 +1,17 @@ +from .BaseController import BaseController +from fastapi import UploadFile +from models import ResponseSignal +import os + + +class ProjectController(BaseController): + + def __init__(self): + super().__init__() + + def get_project_path(self, project_id: str): + project_dir = os.path.join(self.files_dir, project_id) + + if not os.path.exists(project_dir): + os.makedirs(project_dir) + return project_dir diff --git a/src/controllers/__init__.py b/src/controllers/__init__.py new file mode 100644 index 00000000..27959616 --- /dev/null +++ b/src/controllers/__init__.py @@ -0,0 +1,2 @@ +from .DataController import DataController +from .ProjectController import ProjectController \ No newline at end of file diff --git a/src/helpers/__init__.py b/src/helpers/__init__.py new 
file mode 100644 index 00000000..e69de29b diff --git a/src/helpers/config.py b/src/helpers/config.py new file mode 100644 index 00000000..e93f99d5 --- /dev/null +++ b/src/helpers/config.py @@ -0,0 +1,17 @@ +from pydantic_settings import BaseSettings, SettingsConfigDict + +class Settings(BaseSettings): + + APP_NAME: str + APP_VERSION: str + OPENAI_API_KEY: str + + FILE_ALLOWED_TYPES: list + FILE_MAX_SIZE: int + FILE_DEFAULT_CHUNK_SIZE: int + + class Config: + env_file = ".env" + +def get_settings(): + return Settings() diff --git a/src/main.py b/src/main.py index 7744c27f..20e065d6 100644 --- a/src/main.py +++ b/src/main.py @@ -1,10 +1,8 @@ from fastapi import FastAPI -from dotenv import load_dotenv - -load_dotenv() - -from src.routes.base import base_router +from routes.base import base_router +from routes.data import data_router app = FastAPI() app.include_router(base_router) +app.include_router(data_router) diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 00000000..8053833f --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1 @@ +from .enums.ResponseEnums import ResponseSignal \ No newline at end of file diff --git a/src/models/enums/ResponseEnums.py b/src/models/enums/ResponseEnums.py new file mode 100644 index 00000000..bb28c5e8 --- /dev/null +++ b/src/models/enums/ResponseEnums.py @@ -0,0 +1,9 @@ +from enum import Enum + +class ResponseSignal(Enum): + FILE_TYPE_NOT_SUPPORTED = "file_type_not_supported" + FILE_SIZE_EXCEEDED = "file_size_exceeded" + FILE_UPLOADED_SUCCESS = "file_upload_successfully" + FILE_UPLOADED_FAILED = "file_upload_failed" + FILE_VALIDATED_SUCCESS = "file_validated_successfully" + \ No newline at end of file diff --git a/src/models/enums/__init__.py b/src/models/enums/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/requirements.txt b/src/requirements.txt index 3bb443b0..6a7b0e6d 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -2,3 +2,5 @@ fastapi==0.110.2 
uvicorn[standard]==0.29.0 python-multipart==0.0.9 python-dotenv==1.0.1 +pydantic-settings==2.2.1 +aiofiles==23.2.1 diff --git a/src/routes/base.py b/src/routes/base.py index fbfa4fec..1581c726 100644 --- a/src/routes/base.py +++ b/src/routes/base.py @@ -1,6 +1,8 @@ -from fastapi import FastAPI, APIRouter +from fastapi import FastAPI, APIRouter, Depends import os +from helpers.config import get_settings, Settings + base_router = APIRouter( prefix="/api/v1", tags=["api_v1"], @@ -8,8 +10,10 @@ @base_router.get("/") -async def welcome(): - app_name = os.getenv("APP_NAME") - app_version = os.getenv("APP_VERSION") +async def welcome(app_settings: Settings = Depends(get_settings)): + + # app_settings = get_settings() + app_name = app_settings.APP_NAME + app_version = app_settings.APP_VERSION return {"app_name": app_name, "app_version": app_version} diff --git a/src/routes/data.py b/src/routes/data.py new file mode 100644 index 00000000..78d61d12 --- /dev/null +++ b/src/routes/data.py @@ -0,0 +1,57 @@ +from fastapi import FastAPI, APIRouter, Depends, UploadFile, status +from fastapi.responses import JSONResponse +import os +import aiofiles +from helpers.config import get_settings, Settings +from controllers import DataController, ProjectController +from models import ResponseSignal +import logging + +logger = logging.getLogger("uvicorn.error") + +data_router = APIRouter( + prefix="/api/v1/data", + tags=["data_api_v1"], +) + + +@data_router.post("/upload/{project_id}") +async def upload_data( + project_id: str, + file: UploadFile, + app_settings: Settings = Depends(get_settings) +): + + data_controller = DataController() + is_valid, result_signal = data_controller.validate_file(file=file) + if not is_valid: + return JSONResponse( + status_code=status.HTTP_400_BAD_REQUEST, + content={ + "signal": result_signal, + } + ) + + project_dir_path = ProjectController().get_project_path(project_id=project_id) + file_path = 
data_controller.generate_unique_filename(orig_file_name=file.filename, project_id=project_id)
+
+    try:
+        async with aiofiles.open(file_path, "wb") as f:
+            while chunk := await file.read(app_settings.FILE_DEFAULT_CHUNK_SIZE):
+                await f.write(chunk)
+    except Exception as e:
+
+        logger.error(f"File uploaded failed: {e}")
+
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={
+                "signal": ResponseSignal.FILE_UPLOADED_FAILED.value
+            }
+        )
+
+    return JSONResponse(
+        content={
+            "signal": ResponseSignal.FILE_UPLOADED_SUCCESS.value
+        }
+    )