diff --git a/vlmeval/api/__init__.py b/vlmeval/api/__init__.py
index 82ae29d50..f72fca2f1 100644
--- a/vlmeval/api/__init__.py
+++ b/vlmeval/api/__init__.py
@@ -29,6 +29,7 @@
 from .together import TogetherAPI
 from .gcp_vertex import GCPVertexAPI
 from .bedrock import BedrockAPI
+from .deepocr_api import DeepOCRAPI
 
 __all__ = [
     'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V', 'Gemini',
@@ -40,4 +41,5 @@
     'RBdashMMChat3_API', 'RBdashChat3_5_API', 'RBdashMMChat3_78B_API', 'RBdashMMChat3_5_38B_API',
     'VideoChatOnlineV2API', 'TeleMM2_API', 'TeleMM2Thinking_API',
     'TogetherAPI', 'GCPVertexAPI', 'BedrockAPI',
+    'DeepOCRAPI',
 ]
diff --git a/vlmeval/api/deepocr_api.py b/vlmeval/api/deepocr_api.py
new file mode 100644
index 000000000..3207bf01c
--- /dev/null
+++ b/vlmeval/api/deepocr_api.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import os
+
+from .gpt import OpenAIWrapper
+
+
+class DeepOCRAPI(OpenAIWrapper):
+    """OpenAIWrapper subclass that targets a DeepOCR pipeline endpoint.
+
+    The endpoint URL and API key are not constructor arguments; they are read
+    from the DEEPOCR_API_BASE and DEEPOCR_API_KEY environment variables, and
+    __init__ raises ValueError when either one is unset or empty.
+    """
+
+    is_api: bool = True
+
+    def __init__(
+        self,
+        model: str = "deepocr",
+        retry: int = 5,
+        verbose: bool = False,
+        system_prompt: str | None = None,
+        temperature: float = 0,
+        timeout: int = 300,
+        max_tokens: int = 2048,
+        img_size: int = -1,
+        img_detail: str = "high",
+        **kwargs,
+    ):
+        api_base = os.getenv("DEEPOCR_API_BASE", "")  # "" when unset, so the check below catches it
+        api_key = os.getenv("DEEPOCR_API_KEY", "")  # "" when unset, so the check below catches it
+        if not api_base or not api_key:  # fail before the parent constructor runs
+            raise ValueError(
+                "DEEPOCR_API_BASE and DEEPOCR_API_KEY must be set in the environment."
+            )
+
+        super().__init__(
+            model=model,
+            retry=retry,
+            key=api_key,  # value of DEEPOCR_API_KEY
+            verbose=verbose,
+            system_prompt=system_prompt,
+            temperature=temperature,
+            timeout=timeout,
+            api_base=api_base,  # value of DEEPOCR_API_BASE
+            max_tokens=max_tokens,
+            img_size=img_size,
+            img_detail=img_detail,
+            **kwargs,  # NOTE(review): a caller-supplied key= or api_base= would duplicate the keywords above
+        )
diff --git a/vlmeval/config.py b/vlmeval/config.py
index 53ad5c7f3..2f69c7cb2 100644
--- a/vlmeval/config.py
+++ b/vlmeval/config.py
@@ -144,6 +144,15 @@
         retry=10,
         verbose=False,
     ),
+    "DEEPOCR": partial(
+        DeepOCRAPI,
+        model="gpt-4-1106-vision-preview",  # NOTE(review): differs from the class default "deepocr" - confirm intended
+        temperature=0,
+        img_size=-1,
+        img_detail="high",
+        retry=10,  # overrides the DeepOCRAPI default of 5
+        verbose=False,
+    ),
     "GPT4V_20240409": partial(
         GPT4V,
         model="gpt-4-turbo-2024-04-09",