-
Notifications
You must be signed in to change notification settings - Fork 3
Python API
The main class for performing OCR tasks. It orchestrates the detection and recognition pipelines.
from kiri_ocr import OCR
ocr = OCR(model_path="mrrtmob/kiri-ocr", device="cpu")

Initializes the OCR engine, loading detection and recognition models.
def __init__(
self,
model_path="mrrtmob/kiri-ocr",
det_model_path=None,
det_method="db",
det_conf_threshold=0.5,
padding=10,
device="cpu",
verbose=False,
use_beam_search=False,
use_fp16=None
)

Parameters:
| Parameter | Type | Default | Description |
|---|---|---|---|
| `model_path` | `str` | `"mrrtmob/kiri-ocr"` | Path to model file (.safetensors, .pt) or HuggingFace repo ID |
| `det_model_path` | `str` | `None` | Path to custom detector model |
| `det_method` | `str` | `"db"` | Detection method: 'db', 'craft', or 'legacy' |
| `det_conf_threshold` | `float` | `0.5` | Confidence threshold for detection (0.0-1.0) |
| `padding` | `int` | `10` | Pixels to pad around detected text boxes |
| `device` | `str` | `"cpu"` | Computation device: 'cpu' or 'cuda' |
| `verbose` | `bool` | `False` | Print detailed logs |
| `use_beam_search` | `bool` | `False` | Use beam search for higher quality (slower) |
| `use_fp16` | `bool` | `None` | Force FP16 precision on CUDA |
Decoding Options (use_beam_search):
| Value | Description | Speed | Quality |
|---|---|---|---|
| `False` | Greedy decoding (default) | ⚡⚡ Fast | Good |
| `True` | Beam search decoder | ⚡ Slower | Best |
Example:
# Default greedy decoding (fast)
ocr = OCR()
# Beam search for highest quality
ocr = OCR(use_beam_search=True, device="cuda")

Extracts all text from a document image as a single string. Primary method for full document OCR.
def extract_text(self, image_path, mode="lines", verbose=False)

Parameters:
- image_path (str | Path): Input image path
- mode (str): Detection granularity — 'lines' (default) or 'words'
- verbose (bool): Print progress updates
Returns:
- full_text (str): Combined text, lines separated by `\n`
- results (list[dict]): Detailed results for each region
Result Dictionary:
{
"box": [x, y, width, height], # Integer coordinates
"text": "Recognized text", # String
"confidence": 0.98, # Recognition confidence (0.0-1.0)
"det_confidence": 0.95, # Detection confidence (0.0-1.0)
"line_number": 1 # Sort order index
}

Example:
text, results = ocr.extract_text('document.jpg')
print(text)
for line in results:
    print(f"[{line['confidence']:.0%}] {line['text']}")

Returns only the list of results (no combined string). Useful for structured data processing.
def process_document(self, image_path, mode="lines", verbose=False) -> list[dict]

Recognize text from a cropped image of a single text line (skips detection).
def recognize_single_line_image(self, image_path)

Parameters:
- image_path (str | Path): Path to the cropped text line image

Returns:
- text (str): Recognized text
- confidence (float): Confidence score
Example:
text, confidence = ocr.recognize_single_line_image('line.png')
print(f"'{text}' ({confidence:.1%})")

Kiri OCR supports real-time streaming output, similar to LLM text generation. Characters are yielded as they're decoded, enabling live UI updates.
Full document OCR with character-by-character streaming.
def extract_text_stream_chars(
self,
image_path,
mode="lines",
use_beam_search=None,
verbose=False
) -> Generator[Dict, None, None]

Parameters:
- image_path (str | Path): Document image path
- mode (str): 'lines' or 'words'
- use_beam_search (bool, optional): Override instance setting
- verbose (bool): Print progress
Yields:
{
"token": "ក", # New character (empty at region start)
"text": "ការ", # Current region text so far
"cumulative_text": "Line 1\nLine 2", # All text so far
"region_number": 2, # Current region (1-indexed)
"total_regions": 5, # Total detected regions
"step": 3, # Decoding step in current region
"confidence": 0.95, # Token confidence
"region_finished": False, # Is current region done?
"document_finished": False, # Is entire document done?
"region_start": False, # Is this start of new region?
"box": [10, 20, 200, 30], # Current region bounding box
"det_confidence": 0.92 # Detection confidence
}

Example - Live Terminal Output:
for chunk in ocr.extract_text_stream_chars('document.png'):
if chunk['region_start']:
print(f"\n[Line {chunk['region_number']}] ", end='')
print(chunk['token'], end='', flush=True)
if chunk['document_finished']:
        print("\n✅ Done!")

Example - Progress UI:
for chunk in ocr.extract_text_stream_chars('document.png'):
progress = chunk['region_number'] / chunk['total_regions']
update_progress_bar(progress)
    update_text_display(chunk['cumulative_text'])

Stream characters from a single-line image (no detection).
def recognize_streaming(
self,
image_path,
use_beam_search=None
) -> Generator[Dict, None, None]

Yields:
{
"token": "a", # New character
"token_id": 42, # Token ID
"text": "abc", # Full text so far
"confidence": 0.98, # Token confidence
"step": 3, # Current step
"finished": False # Is decoding complete?
}

Example:
for chunk in ocr.recognize_streaming('line.png'):
    print(chunk['token'], end='', flush=True)

Low-level streaming from a preprocessed tensor.
def recognize_region_streaming(
self,
image_tensor,
use_beam_search=None
) -> Generator[Dict, None, None]

Stream region-by-region (entire regions, not characters).
def extract_text_streaming(
self,
image_path,
mode="lines",
verbose=False
) -> Generator[Dict, None, None]

Yields after each region is fully recognized, useful when you don't need character-level streaming.
Recognize text from a preprocessed image tensor.
def recognize_region(self, image_tensor) -> Tuple[str, float]

Parameters:
- image_tensor (torch.Tensor): Preprocessed tensor from preprocess_pil()
Returns:
- text (str): Recognized text
- confidence (float): Confidence score
from kiri_ocr import OCR
ocr = OCR(device='cuda')
# Full document
text, results = ocr.extract_text('invoice.png')
print(text)

from kiri_ocr import OCR
from pathlib import Path
ocr = OCR() # Default greedy decoding
for image in Path('documents/').glob('*.jpg'):
text, _ = ocr.extract_text(image)
    print(f"{image.name}: {len(text)} chars")

import tkinter as tk
from kiri_ocr import OCR
def process_with_streaming(image_path):
ocr = OCR()
for chunk in ocr.extract_text_stream_chars(image_path):
# Update UI with each character
text_widget.insert('end', chunk['token'])
text_widget.update()
if chunk['document_finished']:
            status_label.config(text='Done!')

from kiri_ocr import OCR
# Use beam search for best accuracy on important documents
ocr = OCR(use_beam_search=True, device='cuda')
text, results = ocr.extract_text('contract.png')
# Check confidence
low_conf = [r for r in results if r['confidence'] < 0.9]
if low_conf:
    print(f"Warning: {len(low_conf)} regions have low confidence")

Kiri OCR Home | GitHub Repository | Report Issue
© 2026 Kiri OCR. Released under the Apache 2.0 License.