digitize/digitize.py at main · dratasich/digitize · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3

# %%
import argparse
import glob
import os

import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from PIL.ExifTags import TAGS

# %%
# Command-line interface: the directory to scan, whether to descend into
# sub-directories, which file extensions to pick up and which filename
# fragments mark a file as "not a scan".
desc = "Digitize scans and pdfs - convert images and pdfs to text files."
parser = argparse.ArgumentParser(description=desc)
parser.add_argument(
    "path",
    # A positional argument is mandatory unless nargs="?" is given; without
    # it the default below could never take effect.
    nargs="?",
    default=".",
    type=str,
    help="Directory of files to digitize.",
)
parser.add_argument(
    "-r",
    "--recursive",
    action="store_true",
    default=False,
    help="Recursively search the directory for files to convert.",
)
default_extensions = ["jpg", "png", "pdf"]
parser.add_argument(
    "-e",
    "--extension",
    type=str,
    nargs="+",
    default=default_extensions,
    help=f"""Extensions of files to search for (default: {default_extensions}).""",
)
default_exclude = ["IMG", "IMAG", "DSC"]
parser.add_argument(
    "--exclude",
    type=str,
    nargs="+",
    default=default_exclude,
    help=f"""Exclude files that contain default: {default_exclude}.""",
)
args = parser.parse_args()

# %% get inputs for OCR (images in directory)
# collect all files with the specified extensions
files = []
for ext in args.extension:
    # "**" only spans nested directories when recursive=True; with
    # recursive=False it behaves like "*" and would match files exactly one
    # directory below args.path while skipping args.path itself. Use a flat
    # pattern for the non-recursive case instead.
    if args.recursive:
        pattern = f"{args.path}/**/*.{ext}"
    else:
        pattern = f"{args.path}/*.{ext}"
    files.extend(glob.glob(pattern, recursive=args.recursive))
print(f"{len(files)} with extensions {args.extension}")
# exclude files given some patterns
# Build a new list rather than calling files.remove() inside the loop:
# removing while iterating shifts the remaining items, so the entry that
# follows every dropped file would be skipped and never checked.
# NOTE(review): the pattern is matched against the whole path (directories
# included), not just the basename - confirm that this is intended.
kept = []
for filename in files:
    hit = next((s for s in args.exclude if s in filename), None)
    if hit is None:
        kept.append(filename)
    else:
        print(f"Drop {filename} because '{hit}' is contained in the filename")
files = kept
print(f"{len(files)} files after excluding {args.exclude}")


# %% save text:
def save(text: str, to: str) -> None:
    """Write ``text`` to the file at ``to`` and report the path on stdout.

    An existing file at ``to`` is overwritten.

    Args:
        text: Content to write.
        to: Path of the output file.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Pin the encoding: without it open() falls back to the platform's locale
    # encoding, which may be unable to represent non-ASCII text (for example
    # German umlauts) and would then raise UnicodeEncodeError.
    with open(to, "w", encoding="utf-8") as o:
        o.write(text)
    print(f"Wrote {to}")


# %% convert pdf
def pdf2text(path: str) -> str:
    """Return the text of the PDF at ``path``.

    The embedded text layer of every page is read first. If that yields
    fewer than 5 characters in total, the PDF is treated as a pure scan:
    its pages are rendered to images and run through Tesseract OCR instead.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages (possibly empty).
    """
    # Use the ``path`` argument throughout; the function must not depend on
    # a loop variable that happens to exist at module level in the caller.
    print(f"Run pdf reader on {path}...")
    reader = PyPDF2.PdfReader(path)
    print(
        f"File '{path}': {len(reader.pages)} page(s) {reader.metadata} {reader.pdf_header}"
    )
    # Collect the text of all pages, not only the first one; ``or ""`` keeps
    # the join safe should a page yield no text object at all.
    text = "".join(page.extract_text() or "" for page in reader.pages)
    if len(text) < 5:
        # no text in pdf -> convert to image and try with ocr
        doc = convert_from_path(path)
        text = "".join([pytesseract.image_to_string(page) for page in doc])
    return text


# %% get all meta data fields from an image
def image_metadata(img: Image.Image) -> dict:
    """Collect the EXIF fields of ``img`` into a plain dictionary.

    See https://www.thepythoncode.com/article/extracting-image-metadata-in-python

    Args:
        img: An opened Pillow image.

    Returns:
        A dict mapping each EXIF tag name (or the numeric tag id when the id
        is not listed in ``PIL.ExifTags.TAGS``) to its value. Values of type
        ``bytes`` are decoded to ``str``.
    """
    # Read from the ``img`` argument, not from a module-level variable of
    # the calling script.
    exifdata = img.getexif()
    meta = {}
    # iterating over all EXIF data fields
    for tag_id, data in exifdata.items():
        # get the tag name, instead of human unreadable tag id
        tag = TAGS.get(tag_id, tag_id)
        # decode bytes; EXIF may carry arbitrary binary payloads, so replace
        # undecodable bytes instead of raising UnicodeDecodeError over what
        # is purely informational metadata
        if isinstance(data, bytes):
            data = data.decode(errors="replace")
        meta[tag] = data
    return meta


# %% convert all files
# Each file is converted independently: a failure is reported and the loop
# continues with the next file. The text is written next to the input as
# "<name>_ocr.txt", but only if more than 5 characters were recognized.
print("---")
for f in files:
    try:
        outputfile = f"{os.path.splitext(f)[0]}_ocr.txt"
        # compare case-insensitively so that e.g. "scan.PDF" is handled by
        # the pdf reader rather than being passed to the image loader
        if str(f).lower().endswith(".pdf"):
            text = pdf2text(f)
        else:
            print(f"Run tesseract on {f}...")
            # the context manager closes the image's file handle as soon as
            # OCR is done, instead of leaking one handle per processed file
            with Image.open(f) as image:
                meta = image_metadata(image)
                print(f"File '{f}': {image.format_description} {image.mode} {meta}")
                text = pytesseract.image_to_string(image, lang="eng+deu")
        if len(text) > 5:
            save(text, outputfile)
    except Exception as e:
        # top-level boundary of a batch job: report and carry on
        print(f"Failed to digitize {f}: {e}")