-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdigitize.py
More file actions
executable file
·122 lines (110 loc) · 3.37 KB
/
digitize.py
File metadata and controls
executable file
·122 lines (110 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
# %%
import argparse
import glob
import os
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from PIL.ExifTags import TAGS
# %%
# Command-line interface: one optional directory plus search/filter options.
desc = "Digitize scans and pdfs - convert images and pdfs to text files."
parser = argparse.ArgumentParser(description=desc)
parser.add_argument(
    "path",
    # nargs="?" makes the positional optional; without it argparse always
    # requires a value and the default below would never be used.
    nargs="?",
    default=".",
    type=str,
    help="""Directory of files to digitize.""",
)
parser.add_argument(
    "-r",
    "--recursive",
    action="store_true",
    default=False,
    help="""Recursively search the directory for files to convert.""",
)
# File extensions (without leading dot) searched when -e/--extension is absent.
default_extensions = ["jpg", "png", "pdf"]
parser.add_argument(
    "-e",
    "--extension",
    type=str,
    nargs="+",
    default=default_extensions,
    help=f"""Extensions of files to search for (default: {default_extensions}).""",
)
# Substrings that cause a found file to be dropped when --exclude is absent.
default_exclude = ["IMG", "IMAG", "DSC"]
parser.add_argument(
    "--exclude",
    type=str,
    nargs="+",
    default=default_exclude,
    help=f"""Exclude files that contain default: {default_exclude}.""",
)
args = parser.parse_args()
# %% get inputs for OCR (images in directory)
# collect all files with the specified extensions
files = []
for ext in args.extension:
    # Accept both "pdf" and ".pdf" on the command line.
    ext = ext.lstrip(".")
    # Without recursive=True glob treats "**" like "*", i.e. it would match
    # exactly one directory level below `path` and miss files directly inside
    # it. Only use the "**" pattern when a recursive search was requested.
    if args.recursive:
        pattern = f"{args.path}/**/*.{ext}"
    else:
        pattern = f"{args.path}/*.{ext}"
    files.extend(glob.glob(pattern, recursive=args.recursive))
print(f"{len(files)} with extensions {args.extension}")
# exclude files given some patterns
# Build a new list instead of removing from `files` while iterating over it:
# removing during iteration skips the element that follows each removed one.
kept = []
for filename in files:
    # First exclusion substring found anywhere in the path, or None.
    match = next((s for s in args.exclude if s in filename), None)
    if match is None:
        kept.append(filename)
    else:
        print(f"Drop {filename} because '{match}' is contained in the filename")
files = kept
print(f"{len(files)} files after excluding {args.exclude}")
# %% save text:
def save(text: str, to: str) -> bool:
    """Write ``text`` to the file ``to``, replacing any existing content.

    Args:
        text: The text to store.
        to: Path of the output file.

    Returns:
        ``True`` once the file has been written. Failures are not converted
        into ``False``; they propagate as exceptions.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Fixed UTF-8 so non-ASCII text is written the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(to, "w", encoding="utf-8") as o:
        o.write(text)
    print(f"Wrote {to}")
    return True
# %% convert pdf
def pdf2text(path: str) -> str:
    """Return the text of the PDF at ``path``, using OCR as a fallback.

    The embedded text layer of every page is read first. If that yields
    fewer than 5 non-whitespace-trimmed characters, the pages are rendered
    to images and run through tesseract instead.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages (possibly empty).
    """
    print(f"Run pdf reader on {path}...")
    reader = PyPDF2.PdfReader(path)
    print(
        f"File '{path}': {len(reader.pages)} page(s) {reader.metadata} {reader.pdf_header}"
    )
    # Join the text of all pages; `or ""` guards against a page for which
    # the extractor yields nothing. A PDF with zero pages gives "".
    text = "".join(page.extract_text() or "" for page in reader.pages)
    # Strip before measuring so pages that contain only whitespace/newlines
    # do not prevent the OCR fallback.
    if len(text.strip()) < 5:
        # no text in pdf -> convert to image and try with ocr
        doc = convert_from_path(path)
        text = "".join([pytesseract.image_to_string(page) for page in doc])
    return text
# %% get all meta data fields from an image
def image_metadata(img: "Image.Image") -> dict:
    """Collect the EXIF fields of ``img`` into a plain dictionary.

    https://www.thepythoncode.com/article/extracting-image-metadata-in-python

    Args:
        img: An object providing ``getexif()`` (e.g. an opened PIL image).

    Returns:
        A dict mapping each EXIF tag name (or the numeric tag id when the
        id is not present in ``TAGS``) to its value. ``bytes`` values are
        decoded to ``str``.
    """
    exifdata = img.getexif()
    meta = {}
    # iterating over all EXIF data fields
    for tag_id, data in exifdata.items():
        # get the tag name, instead of human unreadable tag id
        tag = TAGS.get(tag_id, tag_id)
        # decode bytes; replace undecodable bytes rather than raising, since
        # a value may hold arbitrary binary data that is not valid text
        if isinstance(data, bytes):
            data = data.decode(errors="replace")
        meta[tag] = data
    return meta
# %% convert all files
# For every collected file: extract its text (PDF reader or tesseract) and
# write it next to the input as "<name>_ocr.txt". A failure on one file is
# reported and does not stop the remaining files.
print("---")
for f in files:
    try:
        outputfile = f"{os.path.splitext(f)[0]}_ocr.txt"
        # Compare case-insensitively so "scan.PDF" is handled as a PDF too.
        if str(f).lower().endswith(".pdf"):
            text = pdf2text(f)
        else:
            print(f"Run tesseract on {f}...")
            # Context manager closes the underlying file handle after OCR so
            # handles do not accumulate over a large batch.
            with Image.open(f) as image:
                meta = image_metadata(image)
                print(f"File '{f}': {image.format_description} {image.mode} {meta}")
                text = pytesseract.image_to_string(image, lang="eng+deu")
        # Only write an output file when some text was actually recognized.
        if len(text) > 5:
            save(text, outputfile)
    except Exception as e:
        # Top-level boundary of the batch: report and continue with next file.
        print(f"Failed to digitize {f}: {e}")