Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions examples/documents/test_issue_163.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import pytest
import sys
import os
from unittest.mock import patch, MagicMock

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from pageindex.page_index import (
check_if_toc_extraction_is_complete,
check_if_toc_transformation_is_complete,
toc_detector_single_page,
detect_page_index,
extract_toc_content,
toc_transformer,
)


class TestRobustKeyAccess:
    """Exercise the yes/no LLM helpers with empty, malformed and valid replies.

    Each test swaps ``pageindex.page_index.llm_completion`` for a mock that
    returns a canned string, so no model is contacted.
    """

    @patch("pageindex.page_index.llm_completion", return_value="")
    def test_toc_detector_empty_response(self, mock_llm):
        """An empty completion yields "no"."""
        assert toc_detector_single_page("some content", model="test") == "no"

    @patch("pageindex.page_index.llm_completion", return_value='{"toc_detected": "yes"}')
    def test_toc_detector_valid_response(self, mock_llm):
        """A well-formed JSON reply is passed through as "yes"."""
        assert toc_detector_single_page("some content", model="test") == "yes"

    @patch("pageindex.page_index.llm_completion", return_value="not json at all")
    def test_toc_detector_malformed_response(self, mock_llm):
        """A reply that is not JSON yields "no"."""
        assert toc_detector_single_page("some content", model="test") == "no"

    @patch("pageindex.page_index.llm_completion", return_value="")
    def test_extraction_complete_empty_response(self, mock_llm):
        """An empty completion yields "no" for the extraction check."""
        verdict = check_if_toc_extraction_is_complete("doc", "toc", model="test")
        assert verdict == "no"

    @patch("pageindex.page_index.llm_completion", return_value='{"completed": "yes"}')
    def test_extraction_complete_valid_response(self, mock_llm):
        """A valid "completed" reply is returned by the extraction check."""
        verdict = check_if_toc_extraction_is_complete("doc", "toc", model="test")
        assert verdict == "yes"

    @patch("pageindex.page_index.llm_completion", return_value="")
    def test_transformation_complete_empty_response(self, mock_llm):
        """An empty completion yields "no" for the transformation check."""
        verdict = check_if_toc_transformation_is_complete("raw", "cleaned", model="test")
        assert verdict == "no"

    @patch("pageindex.page_index.llm_completion", return_value='{"thinking": "looks fine", "completed": "yes"}')
    def test_transformation_complete_valid_response(self, mock_llm):
        """Extra keys in the reply do not prevent "completed" from being read."""
        verdict = check_if_toc_transformation_is_complete("raw", "cleaned", model="test")
        assert verdict == "yes"

    @patch("pageindex.page_index.llm_completion", return_value="")
    def test_detect_page_index_empty_response(self, mock_llm):
        """An empty completion yields "no" for page-index detection."""
        assert detect_page_index("toc text", model="test") == "no"


class TestExtractTocContentRetryLoop:
    """Cover the continuation loop of ``extract_toc_content``.

    Both ``llm_completion`` and ``check_if_toc_transformation_is_complete``
    are mocked; the completion mock returns ``(text, finish_reason)`` tuples.
    """

    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
    @patch("pageindex.page_index.llm_completion")
    def test_completes_on_first_try(self, mock_llm, mock_check):
        """A complete first reply is returned after a single completion call."""
        mock_check.return_value = "yes"
        mock_llm.return_value = ("full toc content", "finished")

        toc = extract_toc_content("raw content", model="test")

        assert toc == "full toc content"
        assert mock_llm.call_count == 1

    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
    @patch("pageindex.page_index.llm_completion")
    def test_continues_on_incomplete(self, mock_llm, mock_check):
        """A truncated first reply triggers one continuation that is appended."""
        first_reply = ("partial toc", "max_output_reached")
        second_reply = (" continued toc", "finished")
        mock_llm.side_effect = [first_reply, second_reply]
        mock_check.side_effect = ["no", "yes"]

        toc = extract_toc_content("raw content", model="test")

        assert toc == "partial toc continued toc"
        assert mock_llm.call_count == 2

    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
    @patch("pageindex.page_index.llm_completion")
    def test_max_retries_raises_exception(self, mock_llm, mock_check):
        """Never-completing replies end in an exception after six completion calls."""
        mock_check.return_value = "no"
        mock_llm.return_value = ("chunk", "max_output_reached")

        with pytest.raises(Exception, match="Failed to complete table of contents extraction"):
            extract_toc_content("raw content", model="test")

        # Checked after the raises block: a statement following the raising
        # call inside the block would never execute.
        assert mock_llm.call_count == 6

    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
    @patch("pageindex.page_index.llm_completion")
    def test_chat_history_grows_incrementally(self, mock_llm, mock_check):
        """Each continuation call sees two more ``chat_history`` entries than the last."""
        calls = 0
        # Reply for each 1-based call number; anything beyond gets an empty, finished reply.
        replies = {
            1: ("initial", "max_output_reached"),
            2: (" part2", "max_output_reached"),
            3: (" part3", "finished"),
        }
        # Expected len(chat_history) on the calls that are checked (the first call is not).
        expected_history_len = {2: 2, 3: 4}

        def fake_completion(*args, **kwargs):
            nonlocal calls
            calls += 1
            if calls in expected_history_len:
                history = kwargs.get("chat_history", [])
                assert len(history) == expected_history_len[calls]
            return replies.get(calls, ("", "finished"))

        mock_llm.side_effect = fake_completion
        mock_check.side_effect = ["no", "no", "yes"]

        toc = extract_toc_content("raw content", model="test")

        assert toc == "initial part2 part3"


class TestTocTransformerRetryLoop:
    """Cover ``toc_transformer`` with mocked completion and completeness check."""

    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
    @patch("pageindex.page_index.llm_completion")
    def test_completes_on_first_try(self, mock_llm, mock_check):
        """A finished, valid JSON reply is returned as the parsed entry list."""
        payload = '{"table_of_contents": [{"structure": "1", "title": "Intro", "page": 1}]}'
        mock_llm.return_value = (payload, "finished")
        mock_check.return_value = "yes"

        entries = toc_transformer("raw toc", model="test")

        assert len(entries) == 1
        assert entries[0]["title"] == "Intro"

    @patch("pageindex.page_index.check_if_toc_transformation_is_complete")
    @patch("pageindex.page_index.llm_completion")
    def test_handles_missing_table_of_contents_key(self, mock_llm, mock_check):
        """A reply lacking ``table_of_contents`` produces an empty list."""
        mock_check.return_value = "yes"
        mock_llm.return_value = ('{"other_key": "value"}', "finished")

        assert toc_transformer("raw toc", model="test") == []
78 changes: 78 additions & 0 deletions examples/documents/test_multi_doc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import json
import pytest
from unittest.mock import patch, MagicMock
from pageindex.retrieve import get_document, get_document_structure, get_page_content

@pytest.fixture
def mock_documents():
    """Provide a two-entry document registry: a PDF ("doc1") and a Markdown file ("doc2")."""
    pdf_doc = {
        "id": "doc1",
        "path": "test1.pdf",
        "type": "pdf",
        "doc_name": "Document 1",
        "doc_description": "First test doc",
        "structure": [{"title": "Section 1", "page": 1, "text": "Content 1"}],
    }
    md_doc = {
        "id": "doc2",
        "path": "test2.md",
        "type": "md",
        "doc_name": "Document 2",
        "doc_description": "Second test doc",
        "structure": [{"title": "Header 2", "line_num": 1, "text": "Content 2"}],
    }
    return {"doc1": pdf_doc, "doc2": md_doc}

def test_get_document_multi(mock_documents):
    """A list of ids returns one metadata entry per id, keyed by id."""
    # _count_pages is stubbed so no file on disk is read.
    with patch("pageindex.retrieve._count_pages", return_value=5):
        payload = json.loads(get_document(mock_documents, ["doc1", "doc2"]))

    for doc_id in ("doc1", "doc2"):
        assert doc_id in payload
    assert payload["doc1"]["doc_name"] == "Document 1"
    assert payload["doc1"]["page_count"] == 5
    assert payload["doc2"]["line_count"] == 5

def test_get_document_structure_multi(mock_documents):
    """A list of ids returns each document's structure with text stripped."""
    payload = json.loads(get_document_structure(mock_documents, ["doc1", "doc2"]))

    assert "doc1" in payload
    assert "doc2" in payload

    first_node = payload["doc1"][0]
    # The "text" field must have been removed from the returned tree.
    assert "text" not in first_node
    assert first_node["title"] == "Section 1"

def test_get_page_content_multi(mock_documents):
    """A list of ids returns page content per id, via the PDF and Markdown readers."""
    pdf_reader = patch(
        "pageindex.retrieve._get_pdf_page_content",
        return_value=[{"page": 1, "content": "PDF Content"}],
    )
    md_reader = patch(
        "pageindex.retrieve._get_md_page_content",
        return_value=[{"page": 1, "content": "MD Content"}],
    )
    with pdf_reader, md_reader:
        payload = json.loads(get_page_content(mock_documents, ["doc1", "doc2"], "1"))

    assert "doc1" in payload
    assert "doc2" in payload
    assert payload["doc1"][0]["content"] == "PDF Content"
    assert payload["doc2"][0]["content"] == "MD Content"

def test_get_document_multi_with_invalid_id(mock_documents):
    """An unknown id in the list gets its own "not found" error entry."""
    requested = ["doc1", "invalid-id"]
    with patch("pageindex.retrieve._count_pages", return_value=5):
        payload = json.loads(get_document(mock_documents, requested))

    assert "doc1" in payload
    assert "invalid-id" in payload

    missing_entry = payload["invalid-id"]
    assert "error" in missing_entry
    assert "not found" in missing_entry["error"]

def test_backward_compatibility(mock_documents):
    """A single string id still returns the flat, un-nested result."""
    with patch("pageindex.retrieve._count_pages", return_value=5):
        payload = json.loads(get_document(mock_documents, "doc1"))

    # Flat shape: the id is a field of the result, not a key wrapping it.
    assert "doc_id" in payload
    assert payload["doc_id"] == "doc1"
    assert "doc1" not in payload
24 changes: 16 additions & 8 deletions pageindex/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,18 +217,26 @@ def _ensure_doc_loaded(self, doc_id: str):
if full.get('pages'):
doc['pages'] = full['pages']

def get_document(self, doc_id: "str | list[str]") -> str:
    """Return document metadata JSON.

    Args:
        doc_id: A single document id, or a list of ids.

    Returns:
        The JSON string produced by the module-level ``get_document``.
    """
    # Inside the method body the bare name resolves to the module-level
    # function, not to this method.
    return get_document(self.documents, doc_id)

def get_document_structure(self, doc_id: "str | list[str]") -> str:
    """Return document tree structure JSON (without text fields).

    Args:
        doc_id: A single document id, or a list of ids.

    Returns:
        The JSON string produced by the module-level ``get_document_structure``.
    """
    if self.workspace:
        # Normalise to a list so a single loop serves both call styles;
        # _ensure_doc_loaded is always given one id at a time.
        requested_ids = doc_id if isinstance(doc_id, list) else [doc_id]
        for d_id in requested_ids:
            self._ensure_doc_loaded(d_id)
    return get_document_structure(self.documents, doc_id)

def get_page_content(self, doc_id: "str | list[str]", pages: str) -> str:
    """Return page content for the given pages string (e.g. '5-7', '3,8', '12').

    Args:
        doc_id: A single document id, or a list of ids.
        pages: Page selection string, passed through unchanged.

    Returns:
        The JSON string produced by the module-level ``get_page_content``.
    """
    if self.workspace:
        # Normalise to a list so a single loop serves both call styles;
        # _ensure_doc_loaded is always given one id at a time.
        requested_ids = doc_id if isinstance(doc_id, list) else [doc_id]
        for d_id in requested_ids:
            self._ensure_doc_loaded(d_id)
    return get_page_content(self.documents, doc_id, pages)
Loading