ipac2text/main.py at main · ChalmersLibrary/ipac2text · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import json
import re
import sys
from dotenv import load_dotenv
from ocr import ocr_with_claude
import requests

load_dotenv()

# Variables
libris_match = 0
chalmers_holdings = 0
lquery = ''
libris_search_api_endpoint = 'https://libris.kb.se/find'
title = ''
authors = ''

# Script for testing the Claude Vision OCR functions with scanned images from Chalmers IPAC. The extracted data is printed as JSON.

if len(sys.argv) < 2:
    print("Usage: python3 main.py <image_path>")
    sys.exit(1)

image = sys.argv[1]
result = ocr_with_claude(image)
print("Response for " + image + ": ", json.dumps(result, indent=2, ensure_ascii=False))

# Lookup in Libris

if result.get("title"):
    lquery = result["title"]

if result.get("authors"):
    lquery += " " + " ".join(result["authors"])

if result.get("editors"):
    lquery += " " + " ".join(result["editors"])

if result.get("year_of_publication"):
    lquery += " " + result["year_of_publication"]

#if result.get("pages"):
#    lquery += " " + re.sub(r"\D", "", result["pages"])

lquery = lquery.replace(". ", "+").replace(", ", "+")

print('Libris query: ' + lquery)

headers = {'Accept': 'application/json',
           'User-Agent': 'cth-ipac-ocr/1.0'}

libris_lookup = libris_search_api_endpoint + '?q=' + lquery + '&_limit=1'

data = requests.get(url=libris_lookup, headers=headers).text
# convert string to Json
libris_data = json.loads(data)

# print(libris_data)

if libris_data and libris_data['totalItems'] > 0:
    libris_match = 1
    print("Libris match found for query: " + lquery)

    print("  ID:", libris_data['items'][0]['@id'] if '@id' in libris_data['items'][0] else '')

    # Extract additional fields from the response
    libris_year = libris_data['items'][0]['publication'][0]['librissearch:year_4_digits_short'] if 'librissearch:year_4_digits_short' in libris_data['items'][0]['publication'][0] else ''
    if 'hasTitle' in libris_data['items'][0]:
        libris_title = next((t["mainTitle"] for t in libris_data['items'][0]["hasTitle"] if t["@type"] == "Title"), None)
    else:
        libris_title = ''
    libris_id = libris_data['items'][0]['@id'] if '@id' in libris_data['items'][0] else ''
    libris_type = libris_data['items'][0]['instanceOf']['@type'] if '@type' in libris_data['items'][0]['instanceOf'] else ''

    print("Libris match details:")
    print("  ID:", libris_id)
    print("  Type:", libris_type)
    print("  Title:", libris_title)
    print("  Year:", libris_year)


# TODO
# Batch processing of multiple images
# Looking up records in other databases (WorldCat, LC...)
# Create MarcXML record (or something else) from the extracted data?
# Logging and error handling
# Something else?