Tabular-RAG-from-scratch/vision_processor.py at main · MadsDoodle/Tabular-RAG-from-scratch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import time
from typing import Any, Dict, List

from openai import OpenAI

from config import Config

class VisionProcessor:
    """Describe page images by sending them to an OpenAI chat model.

    The client is built from ``Config.OPENAI_API_KEY`` and the model name is
    read from ``Config.VISION_MODEL``.
    """

    # Lower-case substrings that mark a reply as a refusal / generic
    # "cannot analyze" answer. Matching is a heuristic, so a hit only
    # triggers a retry; it never discards the final reply.
    _FAILURE_INDICATORS = (
        "i'm unable to analyze",
        "i cannot analyze",
        "i can't analyze",
        "unable to process",
        "cannot process the image",
    )

    # Instruction text sent with every image.
    _ANALYSIS_PROMPT = """You are analyzing a page from an academic PDF document. Describe EVERYTHING you see in extreme detail.

**CRITICAL INSTRUCTIONS:**
1. You MUST provide a detailed analysis. Do not say you cannot analyze the image.
2. For ANY visualization (chart, graph, plot), you MUST explicitly state the EXACT COLORS used.
3. Start by identifying if there are any figures, charts, or graphs and number them (e.g., "Figure 1:", "Chart 1:").

**For Figures/Charts/Graphs (HIGHEST PRIORITY):**
- Figure number and title (e.g., "Figure 1: Distribution of shortest cycles")
- Type of visualization (line plot, bar chart, scatter plot, histogram, etc.)
- **COLORS - BE EXTREMELY SPECIFIC:**
  * List EVERY color used in the plot
  * For lines: "blue line with circle markers", "red line with plus markers"
  * For bars: "red bars", "blue bars"
  * For markers: "blue circles", "red crosses", "green triangles"
  * For legends: "Legend shows: 'core' in blue, 'ER' in red"
- Axis labels with full text and units (e.g., "X-axis: Cycles length", "Y-axis: # of shortest cycles")
- Axis scales and ranges (e.g., "X-axis from 2 to 16", "Y-axis logarithmic from 10^1 to 10^4")
- Data points and trends visible
- Any grid lines, tick marks, or annotations

**For Mathematical Content/Formulae:**
- Write out ALL equations, formulas, and mathematical expressions
- Describe mathematical notation, symbols, and variables
- Explain what each formula represents
- Note any numbered equations

**For Tables:**
- Table number and caption
- Complete table structure (rows, columns, headers)
- ALL data values
- Column names and units
- Any formatting or highlights

**For Text:**
- ALL readable text, including headers, paragraphs, captions
- Section numbers and titles
- Figure captions and references
- Any footnotes or references

**For Diagrams:**
- All components and their relationships
- Labels and annotations
- Arrows and connections
- Colors used for different elements

**REMEMBER: If you see a chart or graph, you MUST describe its colors explicitly!**"""

    def __init__(self):
        self.client = OpenAI(api_key=Config.OPENAI_API_KEY)
        self.model = Config.VISION_MODEL

    @classmethod
    def _is_refusal(cls, content: str) -> bool:
        """Return True if *content* contains one of the known refusal phrases."""
        lowered = content.lower()
        return any(marker in lowered for marker in cls._FAILURE_INDICATORS)

    def _request_analysis(self, image_base64: str):
        """Send one chat-completion request and return the first choice's text.

        The returned value is whatever ``message.content`` holds, which may
        be ``None``; the caller is responsible for handling that.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self._ANALYSIS_PROMPT},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
            max_tokens=2000,
            temperature=0.2,
        )
        return response.choices[0].message.content

    def analyze_image(
        self,
        image_base64: str,
        page_num: int,
        max_retries: int = 2,
        retry_delay: float = 1.0,
    ) -> str:
        """Analyze one page image and return a labelled text description.

        Args:
            image_base64: Base64 image payload; it is embedded in a
                ``data:image/png;base64,...`` URL.
            page_num: Page number used in progress messages and in the
                header of the returned text.
            max_retries: Number of additional attempts after the first one.
            retry_delay: Seconds to wait before each retry; values ``<= 0``
                skip the wait.

        Returns:
            ``"[Page N - Visual Content Analysis]\\n<description>"`` on
            success. If every attempt raised, or the model returned no text
            at all, a ``"[Page N - Visual Content]"`` message describing the
            failure is returned instead. This method does not propagate
            request errors; they are printed and converted to that message.
        """
        for attempt in range(max_retries + 1):
            can_retry = attempt < max_retries

            try:
                content = self._request_analysis(image_base64)
            except Exception as e:
                # Broad on purpose: any request/response failure is turned
                # into a retry or an error string so a batch keeps going.
                if can_retry:
                    print(f"  Retry {attempt + 1}/{max_retries} for page {page_num} (error: {str(e)[:50]})...")
                    self._pause(retry_delay)
                    continue
                print(f"Error analyzing image on page {page_num} after {max_retries + 1} attempts: {e}")
                return f"[Page {page_num} - Visual Content]\nError processing image after multiple attempts."

            # message.content can be None or blank; there is nothing to
            # return or to scan for refusal phrases in that case.
            if content is None or not content.strip():
                if can_retry:
                    print(f"  Retry {attempt + 1}/{max_retries} for page {page_num} (empty response)...")
                    self._pause(retry_delay)
                    continue
                break

            if can_retry and self._is_refusal(content):
                print(f"  Retry {attempt + 1}/{max_retries} for page {page_num} (generic response detected)...")
                self._pause(retry_delay)
                continue

            # On the last attempt a refusal-looking reply is still returned:
            # the phrase match is heuristic and may be a false positive.
            return f"[Page {page_num} - Visual Content Analysis]\n{content}"

        return f"[Page {page_num} - Visual Content]\nFailed to analyze image after {max_retries + 1} attempts."

    @staticmethod
    def _pause(seconds: float) -> None:
        """Sleep for *seconds* before a retry, skipping non-positive values."""
        if seconds > 0:
            time.sleep(seconds)

    def process_images_batch(self, images_data: List[Dict]) -> List[Dict[str, Any]]:
        """Analyze each image in order and return one result dict per image.

        Args:
            images_data: Dicts that each provide ``'page_num'``,
                ``'image_base64'`` and ``'source'`` keys.

        Returns:
            A list of dicts with keys ``'page_num'``, ``'content'`` (the
            text from :meth:`analyze_image`), ``'type'`` (always
            ``'vision_analysis'``) and ``'source'``. Empty input yields an
            empty list.

        Raises:
            KeyError: If an input dict lacks one of the required keys.
        """
        analyzed_data = []

        for img_data in images_data:
            page_num = img_data['page_num']
            print(f"Analyzing page {page_num}...")

            analysis = self.analyze_image(img_data['image_base64'], page_num)

            analyzed_data.append({
                'page_num': page_num,
                'content': analysis,
                'type': 'vision_analysis',
                'source': img_data['source'],
            })

        return analyzed_data