VectifyAI · Shreyansh1729 · Mar 27, 2026 · Mar 30, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/examples/documents/test_issue_163.py b/examples/documents/test_issue_163.py
@@ -0,0 +1,154 @@
+"""Tests for tolerant parsing of LLM replies and for the TOC retry loops.
+
+Every test patches ``pageindex.page_index.llm_completion`` (and, for the retry
+loops, the completeness check) with canned return values.
+"""
+
+import pytest
+import sys
+import os
+from unittest.mock import patch, MagicMock
+
+# This file lives in examples/documents/, so the repository root (which holds
+# the ``pageindex`` package) is two directory levels up, not one.
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
+
+from pageindex.page_index import (
+    check_if_toc_extraction_is_complete,
+    check_if_toc_transformation_is_complete,
+    toc_detector_single_page,
+    detect_page_index,
+    extract_toc_content,
+    toc_transformer,
+)
+
+
+class TestRobustKeyAccess:
+    """Yes/no helpers are expected to answer "no" on empty or non-JSON replies."""
+
+    @patch("pageindex.page_index.llm_completion", return_value="")
+    def test_toc_detector_empty_response(self, mock_llm):
+        result = toc_detector_single_page("some content", model="test")
+        assert result == "no"
+
+    @patch("pageindex.page_index.llm_completion", return_value='{"toc_detected": "yes"}')
+    def test_toc_detector_valid_response(self, mock_llm):
+        result = toc_detector_single_page("some content", model="test")
+        assert result == "yes"
+
+    @patch("pageindex.page_index.llm_completion", return_value="not json at all")
+    def test_toc_detector_malformed_response(self, mock_llm):
+        result = toc_detector_single_page("some content", model="test")
+        assert result == "no"
+
+    @patch("pageindex.page_index.llm_completion", return_value="")
+    def test_extraction_complete_empty_response(self, mock_llm):
+        result = check_if_toc_extraction_is_complete("doc", "toc", model="test")
+        assert result == "no"
+
+    @patch("pageindex.page_index.llm_completion", return_value='{"completed": "yes"}')
+    def test_extraction_complete_valid_response(self, mock_llm):
+        result = check_if_toc_extraction_is_complete("doc", "toc", model="test")
+        assert result == "yes"
+
+    @patch("pageindex.page_index.llm_completion", return_value="")
+    def test_transformation_complete_empty_response(self, mock_llm):
+        result = check_if_toc_transformation_is_complete("raw", "cleaned", model="test")
+        assert result == "no"
+
+    @patch("pageindex.page_index.llm_completion", return_value='{"thinking": "looks fine", "completed": "yes"}')
+    def test_transformation_complete_valid_response(self, mock_llm):
+        result = check_if_toc_transformation_is_complete("raw", "cleaned", model="test")
+        assert result == "yes"
+
+    @patch("pageindex.page_index.llm_completion", return_value="")
+    def test_detect_page_index_empty_response(self, mock_llm):
+        result = detect_page_index("toc text", model="test")
+        assert result == "no"
+
+
+class TestExtractTocContentRetryLoop:
+    """Expect extract_toc_content to keep asking until the check answers "yes"."""
+
+    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
+    @patch("pageindex.page_index.llm_completion")
+    def test_completes_on_first_try(self, mock_llm, mock_check):
+        mock_llm.return_value = ("full toc content", "finished")
+        mock_check.return_value = "yes"
+        result = extract_toc_content("raw content", model="test")
+        assert result == "full toc content"
+        assert mock_llm.call_count == 1
+
+    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
+    @patch("pageindex.page_index.llm_completion")
+    def test_continues_on_incomplete(self, mock_llm, mock_check):
+        # Successive replies are expected to be concatenated verbatim.
+        mock_llm.side_effect = [
+            ("partial toc", "max_output_reached"),
+            (" continued toc", "finished"),
+        ]
+        mock_check.side_effect = ["no", "yes"]
+        result = extract_toc_content("raw content", model="test")
+        assert result == "partial toc continued toc"
+        assert mock_llm.call_count == 2
+
+    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
+    @patch("pageindex.page_index.llm_completion")
+    def test_max_retries_raises_exception(self, mock_llm, mock_check):
+        mock_llm.return_value = ("chunk", "max_output_reached")
+        mock_check.return_value = "no"
+        with pytest.raises(Exception, match="Failed to complete table of contents extraction"):
+            extract_toc_content("raw content", model="test")
+        # Six calls expected; presumably the first attempt plus five
+        # continuation retries -- confirm against extract_toc_content.
+        assert mock_llm.call_count == 6
+
+    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
+    @patch("pageindex.page_index.llm_completion")
+    def test_chat_history_grows_incrementally(self, mock_llm, mock_check):
+        """Each continuation call should see two more chat_history entries."""
+        call_count = [0]
+
+        def side_effect(*args, **kwargs):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                return ("initial", "max_output_reached")
+            if call_count[0] == 2:
+                history = kwargs.get("chat_history", [])
+                assert len(history) == 2
+                return (" part2", "max_output_reached")
+            if call_count[0] == 3:
+                history = kwargs.get("chat_history", [])
+                assert len(history) == 4
+                return (" part3", "finished")
+            return ("", "finished")
+
+        mock_llm.side_effect = side_effect
+        mock_check.side_effect = ["no", "no", "yes"]
+        result = extract_toc_content("raw content", model="test")
+        assert result == "initial part2 part3"
+
+
+class TestTocTransformerRetryLoop:
+    """Expect toc_transformer to return the parsed "table_of_contents" list."""
+
+    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
+    @patch("pageindex.page_index.llm_completion")
+    def test_completes_on_first_try(self, mock_llm, mock_check):
+        mock_llm.return_value = (
+            '{"table_of_contents": [{"structure": "1", "title": "Intro", "page": 1}]}',
+            "finished",
+        )
+        mock_check.return_value = "yes"
+        result = toc_transformer("raw toc", model="test")
+        assert len(result) == 1
+        assert result[0]["title"] == "Intro"
+
+    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
+    @patch("pageindex.page_index.llm_completion")
+    def test_handles_missing_table_of_contents_key(self, mock_llm, mock_check):
+        """A reply without "table_of_contents" should yield an empty list."""
+        mock_llm.return_value = ('{"other_key": "value"}', "finished")
+        mock_check.return_value = "yes"
+        result = toc_transformer("raw toc", model="test")
+        assert result == []
diff --git a/examples/documents/test_multi_doc.py b/examples/documents/test_multi_doc.py
@@ -0,0 +1,86 @@
+"""Tests for the retrieve helpers when given one or several document ids."""
+
+import json
+import pytest
+from unittest.mock import patch, MagicMock
+from pageindex.retrieve import get_document, get_document_structure, get_page_content
+
+
+@pytest.fixture
+def mock_documents():
+    """Return an in-memory registry holding one PDF and one Markdown entry."""
+    pdf_doc = {
+        "id": "doc1",
+        "path": "test1.pdf",
+        "type": "pdf",
+        "doc_name": "Document 1",
+        "doc_description": "First test doc",
+        "structure": [{"title": "Section 1", "page": 1, "text": "Content 1"}],
+    }
+    md_doc = {
+        "id": "doc2",
+        "path": "test2.md",
+        "type": "md",
+        "doc_name": "Document 2",
+        "doc_description": "Second test doc",
+        "structure": [{"title": "Header 2", "line_num": 1, "text": "Content 2"}],
+    }
+    return {"doc1": pdf_doc, "doc2": md_doc}
+
+
+def test_get_document_multi(mock_documents):
+    """A list of ids yields one metadata entry per id, keyed by id."""
+    with patch("pageindex.retrieve._count_pages", return_value=5):
+        payload = json.loads(get_document(mock_documents, ["doc1", "doc2"]))
+
+    assert "doc1" in payload
+    assert "doc2" in payload
+    assert payload["doc1"]["doc_name"] == "Document 1"
+    assert payload["doc1"]["page_count"] == 5
+    assert payload["doc2"]["line_count"] == 5
+
+
+def test_get_document_structure_multi(mock_documents):
+    """Structures come back per id with the ``text`` field stripped."""
+    payload = json.loads(get_document_structure(mock_documents, ["doc1", "doc2"]))
+
+    assert "doc1" in payload
+    assert "doc2" in payload
+    first_node = payload["doc1"][0]
+    assert "text" not in first_node
+    assert first_node["title"] == "Section 1"
+
+
+def test_get_page_content_multi(mock_documents):
+    """With both content readers stubbed, each id gets its own reader's pages."""
+    pdf_pages = [{"page": 1, "content": "PDF Content"}]
+    md_pages = [{"page": 1, "content": "MD Content"}]
+    with patch("pageindex.retrieve._get_pdf_page_content", return_value=pdf_pages):
+        with patch("pageindex.retrieve._get_md_page_content", return_value=md_pages):
+            payload = json.loads(get_page_content(mock_documents, ["doc1", "doc2"], "1"))
+
+    assert "doc1" in payload
+    assert "doc2" in payload
+    assert payload["doc1"][0]["content"] == "PDF Content"
+    assert payload["doc2"][0]["content"] == "MD Content"
+
+
+def test_get_document_multi_with_invalid_id(mock_documents):
+    """An unknown id gets an ``error`` entry alongside the valid results."""
+    with patch("pageindex.retrieve._count_pages", return_value=5):
+        payload = json.loads(get_document(mock_documents, ["doc1", "invalid-id"]))
+
+    assert "doc1" in payload
+    assert "invalid-id" in payload
+    assert "error" in payload["invalid-id"]
+    assert "not found" in payload["invalid-id"]["error"]
+
+
+def test_backward_compatibility(mock_documents):
+    """A bare string id returns the flat result, not one nested by id."""
+    with patch("pageindex.retrieve._count_pages", return_value=5):
+        payload = json.loads(get_document(mock_documents, "doc1"))
+
+    assert "doc_id" in payload
+    assert payload["doc_id"] == "doc1"
+    assert "doc1" not in payload  # Should not be nested
diff --git a/pageindex/client.py b/pageindex/client.py
@@ -217,18 +217,24 @@ def _ensure_doc_loaded(self, doc_id: str):
         if full.get('pages'):
             doc['pages'] = full['pages']
 
-    def get_document(self, doc_id: str) -> str:
-        """Return document metadata JSON."""
+    def _ensure_docs_loaded(self, doc_id: "str | list[str]") -> None:
+        """Call _ensure_doc_loaded for doc_id, or for each id when it is a list."""
+        doc_ids = doc_id if isinstance(doc_id, list) else [doc_id]
+        for d_id in doc_ids:
+            self._ensure_doc_loaded(d_id)
+
+    def get_document(self, doc_id: "str | list[str]") -> str:
+        """Return document metadata JSON. doc_id can be a string or a list of strings."""
         return get_document(self.documents, doc_id)
 
-    def get_document_structure(self, doc_id: str) -> str:
-        """Return document tree structure JSON (without text fields)."""
+    def get_document_structure(self, doc_id: "str | list[str]") -> str:
+        """Return document tree structure JSON (without text fields). doc_id can be a string or a list of strings."""
         if self.workspace:
-            self._ensure_doc_loaded(doc_id)
+            self._ensure_docs_loaded(doc_id)
         return get_document_structure(self.documents, doc_id)
 
-    def get_page_content(self, doc_id: str, pages: str) -> str:
-        """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
+    def get_page_content(self, doc_id: "str | list[str]", pages: str) -> str:
+        """Return page content for the given pages string (e.g. '5-7', '3,8', '12'). doc_id can be a string or a list of strings."""
         if self.workspace:
-            self._ensure_doc_loaded(doc_id)
+            self._ensure_docs_loaded(doc_id)
         return get_page_content(self.documents, doc_id, pages)