Skip to content

Commit b7039cc

Browse files
committed
Updated the examples
1 parent 9e20da2 commit b7039cc

File tree

8 files changed

+1397
-316
lines changed

8 files changed

+1397
-316
lines changed

README.md

Lines changed: 445 additions & 289 deletions
Large diffs are not rendered by default.

api.md

Lines changed: 396 additions & 27 deletions
Large diffs are not rendered by default.

examples/README.md

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Docstrange Python SDK Examples
2+
3+
This directory contains practical examples demonstrating common use cases for the Docstrange Python SDK.
4+
5+
## Prerequisites
6+
7+
1. Install the SDK:
8+
```bash
9+
pip install docstrange-api
10+
```
11+
12+
2. Set your API key:
13+
```bash
14+
export DOCSTRANGE_API_KEY="your-api-key"
15+
```
16+
17+
## Examples
18+
19+
### Basic Extraction
20+
**File:** `basic_extraction.py`
21+
22+
The simplest way to extract content from a document as Markdown.
23+
24+
```bash
25+
python basic_extraction.py invoice.pdf
26+
```
27+
28+
### JSON Extraction
29+
**File:** `json_extraction.py`
30+
31+
Extract structured data with specific fields from invoices or forms.
32+
33+
```bash
34+
python json_extraction.py invoice.pdf
35+
```
36+
37+
### Streaming Extraction
38+
**File:** `streaming_extraction.py`
39+
40+
Real-time extraction using Server-Sent Events. Content is displayed progressively as it's generated.
41+
42+
```bash
43+
python streaming_extraction.py large_document.pdf
44+
```
45+
46+
### Async with Polling
47+
**File:** `async_with_polling.py`
48+
49+
Submit a document for async processing and poll for completion. Useful for large documents.
50+
51+
```bash
52+
python async_with_polling.py large_document.pdf
53+
```
54+
55+
### Document Classification
56+
**File:** `document_classification.py`
57+
58+
Classify documents into custom categories with confidence scores.
59+
60+
```bash
61+
python document_classification.py unknown_document.pdf
62+
```
63+
64+
## Running All Examples
65+
66+
```bash
67+
# Make examples executable
68+
chmod +x *.py
69+
70+
# Run with a test document
71+
./basic_extraction.py test.pdf
72+
./json_extraction.py invoice.pdf
73+
./streaming_extraction.py document.pdf
74+
./document_classification.py document.pdf
75+
./async_with_polling.py large_doc.pdf
76+
```
77+
78+
## Tips
79+
80+
- **Input methods**: Examples use `file=` (binary), `file_base64=`, or `file_url=` depending on use case
81+
- **Multiple formats**: Request multiple outputs with `output_format="markdown,json"`
82+
- **Metadata**: Add `include_metadata="bounding_boxes"` for coordinate data
83+
- **Custom extraction**: Use `custom_instructions` for specific extraction requirements

examples/async_with_polling.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Async Extraction with Polling Example
4+
5+
This example demonstrates asynchronous document processing with polling.
6+
Useful for large documents that take longer to process.
7+
8+
Usage:
9+
export DOCSTRANGE_API_KEY="your-api-key"
10+
python async_with_polling.py document.pdf
11+
"""
12+
13+
import base64
14+
import sys
15+
import time
16+
from pathlib import Path
17+
18+
from docstrange import Docstrange
19+
20+
21+
def poll_until_complete(
    client: Docstrange,
    record_id: str,
    timeout: int = 120,
    poll_interval: float = 2,
):
    """Poll an async extraction until it completes, fails, or times out.

    Args:
        client: Client whose ``extract.results.retrieve`` is called with
            ``record_id`` on every poll.
        record_id: Identifier of the previously submitted extraction job.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to wait between polls. Defaults to 2, the
            value that was previously hard-coded.

    Returns:
        The first retrieved result whose ``status`` is ``"completed"``.

    Raises:
        RuntimeError: If a retrieved result has ``status == "failed"``.
        TimeoutError: If no poll reported completion within ``timeout``
            seconds.
    """
    # Use a monotonic clock so a wall-clock adjustment (NTP sync, DST) can
    # neither stretch nor cut short the timeout.
    start_time = time.monotonic()
    deadline = start_time + timeout

    while time.monotonic() < deadline:
        result = client.extract.results.retrieve(record_id)

        if result.status == "completed":
            return result
        if result.status == "failed":
            raise RuntimeError(f"Extraction failed: {result.message}")

        # Show progress; "\r" keeps the status on one overwritten line.
        elapsed = int(time.monotonic() - start_time)
        print(f" Status: {result.status} (elapsed: {elapsed}s)", end="\r")

        # Never sleep past the deadline: cap the wait at the time remaining.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(poll_interval, remaining))

    raise TimeoutError(f"Extraction did not complete within {timeout} seconds")
41+
42+
43+
def main():
    """Submit a file for async extraction, poll for the result, and print it.

    Reads the document path from ``sys.argv[1]``, base64-encodes the file,
    submits it via ``client.extract.async_`` requesting both Markdown and
    JSON output, then waits with ``poll_until_complete``.

    Exits with status 1 when no path is given, when the path is not a
    regular file, when polling times out, or when any other exception is
    raised while polling or printing the result.
    """
    # Function-scope import, as in the original, but hoisted out of the
    # middle of the ``try`` block so dependencies are visible up front.
    import json

    if len(sys.argv) < 2:
        print("Usage: python async_with_polling.py <file_path>")
        sys.exit(1)

    file_path = Path(sys.argv[1])
    # is_file() rather than exists(): a directory "exists" but cannot be
    # read as a document and would otherwise crash with a traceback.
    if not file_path.is_file():
        print(f"Error: File not found: {file_path}")
        sys.exit(1)

    client = Docstrange()

    # Read file as base64
    file_base64 = base64.b64encode(file_path.read_bytes()).decode("utf-8")

    print(f"Submitting for async processing: {file_path.name}")
    print("-" * 50)

    # Submit for async processing
    job = client.extract.async_(
        file_base64=file_base64,
        output_format="markdown,json",  # Multiple formats
    )

    print("✓ Job submitted")
    print(f" Record ID: {job.record_id}")
    print(f" Status: {job.status}")
    print()
    print("Polling for completion...")

    try:
        result = poll_until_complete(client, job.record_id, timeout=120)
        print()  # Move past the "\r"-overwritten status line
        print(f"\n✓ Extraction completed in {result.processing_time:.2f}s")
        print(f" Pages processed: {result.pages_processed}")

        # Show results
        print("\n" + "=" * 50)
        print("MARKDOWN RESULT (first 500 chars):")
        print("=" * 50)
        markdown = result.result.markdown.content
        print(markdown[:500] + ("..." if len(markdown) > 500 else ""))

        print("\n" + "=" * 50)
        print("JSON RESULT:")
        print("=" * 50)
        # Output is capped at 1000 characters to keep the console readable.
        print(json.dumps(result.result.json_.content, indent=2)[:1000])

    except TimeoutError as e:
        # The job may still finish server-side; tell the user how to find it.
        print(f"\n{e}")
        print(f"You can check the status later with record_id: {job.record_id}")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary of an example script: report and exit non-zero.
        print(f"\n✗ Error: {e}")
        sys.exit(1)
100+
101+
102+
if __name__ == "__main__":
103+
main()

examples/basic_extraction.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Basic Document Extraction Example
4+
5+
This example demonstrates the simplest way to extract content from a document
6+
using the Docstrange Python SDK.
7+
8+
Usage:
9+
export DOCSTRANGE_API_KEY="your-api-key"
10+
python basic_extraction.py document.pdf
11+
"""
12+
13+
import sys
14+
from pathlib import Path
15+
16+
from docstrange import Docstrange
17+
18+
19+
def main():
    """Extract a document as Markdown and print the result.

    Reads the document path from ``sys.argv[1]``, sends the open file to
    ``client.extract.sync`` with ``output_format="markdown"``, and prints
    the processing summary followed by the extracted Markdown.

    Exits with status 1 when no path is given, when the path is not a
    regular file, or when the response reports ``success`` as falsy.
    """
    if len(sys.argv) < 2:
        print("Usage: python basic_extraction.py <file_path>")
        print("Example: python basic_extraction.py invoice.pdf")
        sys.exit(1)

    file_path = Path(sys.argv[1])
    # is_file() rather than exists(): a directory "exists" but open() on it
    # would raise and dump a traceback instead of this friendly error.
    if not file_path.is_file():
        print(f"Error: File not found: {file_path}")
        sys.exit(1)

    # Initialize the client (reads DOCSTRANGE_API_KEY from environment)
    client = Docstrange()

    print(f"Extracting content from: {file_path.name}")
    print("-" * 50)

    # Extract content as Markdown; the file stays open for the whole call.
    with open(file_path, "rb") as f:
        response = client.extract.sync(
            file=f,
            output_format="markdown",
        )

    # Guard clause: bail out early so the success path is not nested.
    if not response.success:
        print(f"✗ Extraction failed: {response.message}")
        sys.exit(1)

    print(f"✓ Extraction completed in {response.processing_time:.2f}s")
    print(f" Record ID: {response.record_id}")
    print(f" Pages processed: {response.pages_processed}")
    print()
    print("=" * 50)
    print("EXTRACTED CONTENT:")
    print("=" * 50)
    print(response.result.markdown.content)
55+
56+
57+
if __name__ == "__main__":
58+
main()
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Document Classification Example
4+
5+
This example demonstrates how to classify documents into custom categories.
6+
7+
Usage:
8+
export DOCSTRANGE_API_KEY="your-api-key"
9+
python document_classification.py document.pdf
10+
"""
11+
12+
import json
13+
import sys
14+
from pathlib import Path
15+
16+
from docstrange import Docstrange
17+
18+
19+
def main():
    """Classify each page of a document into custom categories and print a report.

    Reads the document path from ``sys.argv[1]``, sends the open file and a
    JSON-encoded category list to ``client.classify.sync``, then prints a
    per-page classification followed by a page count per category.

    Exits with status 1 when no path is given, when the path is not a
    regular file, or when the response is unsuccessful or has no result.
    """
    # Function-scope import so this block does not depend on the file header.
    from collections import Counter

    if len(sys.argv) < 2:
        print("Usage: python document_classification.py <file_path>")
        sys.exit(1)

    file_path = Path(sys.argv[1])
    # is_file() rather than exists(): a directory "exists" but open() on it
    # would raise and dump a traceback instead of this friendly error.
    if not file_path.is_file():
        print(f"Error: File not found: {file_path}")
        sys.exit(1)

    client = Docstrange()

    # Define classification categories
    categories = [
        {
            "name": "Invoice",
            "description": "Bills, invoices, payment requests, and receipts",
        },
        {
            "name": "Contract",
            "description": "Legal agreements, contracts, terms of service",
        },
        {
            "name": "Resume",
            "description": "CVs, resumes, job applications",
        },
        {
            "name": "Report",
            "description": "Business reports, analysis documents, presentations",
        },
        {
            "name": "Letter",
            "description": "Business letters, correspondence, memos",
        },
        {
            "name": "Form",
            "description": "Application forms, government forms, questionnaires",
        },
    ]

    print(f"Classifying: {file_path.name}")
    print(f"Categories: {', '.join(c['name'] for c in categories)}")
    print("-" * 50)

    with open(file_path, "rb") as f:
        response = client.classify.sync(
            file=f,
            # The category list is passed as a JSON string, not a Python list.
            categories=json.dumps(categories),
        )

    # Guard clause: bail out early so the report below is not nested.
    if not (response.success and response.result):
        print(f"✗ Classification failed: {response.message}")
        sys.exit(1)

    result = response.result
    print("\n✓ Classification completed")
    print(f" File: {result.filename}")
    print(f" Total pages: {result.total_pages}")
    print(f" Processing time: {result.processing_time:.2f}s")
    print()

    print("PAGE-BY-PAGE CLASSIFICATION:")
    print("=" * 50)

    for page in result.pages:
        # Ten-segment bar. Coerce to int so a float confidence cannot break
        # string repetition, and clamp so the bar is always 10 cells wide.
        # Assumes confidence is on a 0-100 scale, as the "%" suffix below
        # implies -- TODO confirm against the API reference.
        filled = max(0, min(10, int(page.confidence) // 10))
        confidence_bar = "█" * filled + "░" * (10 - filled)
        print(f"\nPage {page.page_number}:")
        print(f" Category: {page.category}")
        print(f" Confidence: [{confidence_bar}] {page.confidence}%")
        print(f" Reasoning: {page.reasoning}")

        if page.identified_category:
            print(f" Note: Model identified as '{page.identified_category}' but categorized as 'Other'")

    # Summary
    print("\n" + "=" * 50)
    print("SUMMARY:")

    # Count pages per category. most_common() sorts by descending count and
    # keeps first-seen order for ties, matching the previous stable sort.
    category_counts = Counter(page.category for page in result.pages)
    for cat, count in category_counts.most_common():
        print(f" {cat}: {count} page(s)")
106+
107+
108+
if __name__ == "__main__":
109+
main()

0 commit comments

Comments
 (0)