Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 45 additions & 19 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,32 +81,58 @@ def main():
task = example_tasks[0]
print(f"Using example task: {task}")

# Execute task
# Execute first task
is_first_task = True
try:
result = agent.execute(task)

print("\n" + "="*80)
print("📊 FINAL RESULT")
print("="*80)
print(result)
print()

# Show artifacts
artifacts = list(ARTIFACTS_DIR.glob("*"))
if artifacts:
print(f"\n📁 Generated {len(artifacts)} artifacts in {ARTIFACTS_DIR}/")
recent = sorted(artifacts, key=lambda p: p.stat().st_mtime, reverse=True)[:5]
for artifact in recent:
size = artifact.stat().st_size
print(f" - {artifact.name} ({size} bytes)")
while True:
try:
result = agent.execute(task, is_followup=not is_first_task)

print("\n" + "="*80)
print("📊 RESULT")
print("="*80)
print(result)
print()

# Show artifacts
artifacts = list(ARTIFACTS_DIR.glob("*"))
if artifacts:
print(f"\n📁 Generated {len(artifacts)} artifacts in {ARTIFACTS_DIR}/")
recent = sorted(artifacts, key=lambda p: p.stat().st_mtime, reverse=True)[:5]
for artifact in recent:
size = artifact.stat().st_size
print(f" - {artifact.name} ({size} bytes)")

# Ask for follow-up task
print("\n" + "="*80)
print("💬 You can continue with a follow-up task in this session")
print(" (Type 'exit', 'quit', or 'q' to end the session)")
print("="*80)
followup = input("\nEnter follow-up task (or 'exit' to end): ").strip()

if not followup or followup.lower() in ['exit', 'quit', 'q']:
print("\n👋 Ending session...")
break

task = followup
is_first_task = False

except KeyboardInterrupt:
print("\n\n⚠️ Task interrupted by user")
break

except KeyboardInterrupt:
print("\n\n⚠️ Task interrupted by user")
except Exception as e:
print(f"\n❌ Error: {e}")
import traceback
traceback.print_exc()

finally:
# Always generate final interpretation and close session
try:
agent.close_session()
except Exception as e:
print(f"⚠️ Error during session close: {e}")

print("\n" + "="*80)
print("👋 Thank you for using Vision-LLM Web Agent!")
print("="*80 + "\n")
Expand Down
114 changes: 113 additions & 1 deletion tests/test_file_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
extract_pdf_text,
extract_pdf_images,
save_or_crop_image,
write_text_to_file
write_text_to_file,
ocr_image_to_text
)
from vision_llm_web_agent.tools.browser_control import browser_state
from PIL import Image, ImageDraw,ImageFont
import pytesseract


@pytest.fixture
Expand Down Expand Up @@ -81,6 +84,34 @@ def sample_image(tmp_path):
img.save(img_path)
return img_path

@pytest.fixture
def sample_image_for_ocr(tmp_path):
"""
创建一个包含可识别文本的图像,用于 OCR 测试。
注意:这需要系统安装了字体文件才能正确工作。
"""
img_path = tmp_path / "ocr_test_image.png"

# 创建一个白色背景的图像
img = Image.new('RGB', (400, 100), color='white')
draw = ImageDraw.Draw(img)

test_text = "OCR Test: Hello World 123"

# 尝试使用一个通用字体,如果找不到可能会失败
try:
# 尝试加载一个相对可靠的默认字体(可能需要调整路径或使用系统字体名称)
font = ImageFont.truetype("arial.ttf", 24)
except IOError:
# Fallback to default font if the specified one is not found
font = ImageFont.load_default()

# 在图像上绘制文本
draw.text((10, 10), test_text, fill='black', font=font)

img.save(img_path)
return img_path, test_text # 返回路径和预期的文本内容


class TestDownloadPdfFile:
"""Test the download_pdf_file function"""
Expand Down Expand Up @@ -425,3 +456,84 @@ def test_crop_and_save_chain(self, sample_image, test_artifacts_dir):
img = Image.open(full_final)
assert img.size == (100, 100)




class TestOcrImageToText:
"""Test the ocr_image_to_text function"""

def test_ocr_basic_success(self, sample_image_for_ocr, test_artifacts_dir):
"""Test basic OCR and text saving"""
image_path, expected_text = sample_image_for_ocr
output_file_name = "test/ocr_result.txt"

# 运行 OCR 工具
result = ocr_image_to_text(str(image_path), output_file_name)

# 检查结果是否成功
assert "✅" in result, f"OCR failed or did not return success status: {result}"

# 检查输出文件是否存在于 artifacts 目录中
full_output_path = Path("artifacts") / output_file_name
assert full_output_path.exists()

# 读取内容并验证文本是否被识别(由于 OCR 准确性无法保证完美,我们检查内容长度和关键字)
extracted_content = full_output_path.read_text(encoding='utf-8').strip()

# OCR 引擎可能会在文本中添加换行符或其他空格
assert len(extracted_content) > 10, "Extracted text is too short or empty."
assert "Hello World" in extracted_content, "Expected text 'Hello World' not found in OCR output."


def test_ocr_creates_directory(self, sample_image_for_ocr, test_artifacts_dir):
"""Test that OCR output creates parent directories"""
image_path, _ = sample_image_for_ocr
output_file_name = "test/nested/ocr/output.txt"

# 运行 OCR 工具
result = ocr_image_to_text(str(image_path), output_file_name)

assert "✅" in result

# 检查目录是否被创建
full_output_path = Path("artifacts") / output_file_name
assert full_output_path.parent.exists()
assert full_output_path.exists()


def test_ocr_nonexistent_image(self, test_artifacts_dir):
"""Test OCR on a nonexistent image file"""
nonexistent_image = "nonexistent_ocr.png"
output_file_name = "test/ocr_fail.txt"

# 运行 OCR 工具
result = ocr_image_to_text(nonexistent_image, output_file_name)

# 检查结果是否为失败
assert "❌" in result
assert "file not found" in result

# 检查输出文件是否未被创建
full_output_path = Path("artifacts") / output_file_name
assert not full_output_path.exists()


def test_ocr_empty_output(self, sample_image, test_artifacts_dir):
"""Test OCR on an image with little or no recognizable text (e.g., solid color)"""
image_path = sample_image # This is a 200x200 blue image with a white square, little text
output_file_name = "test/ocr_empty.txt"

# 运行 OCR 工具
result = ocr_image_to_text(str(image_path), output_file_name)

# 检查结果可能是成功但有警告,或者失败
# 如果 pytesseract 成功运行但识别不出文字,通常会返回成功状态和警告。
assert "✅" in result or "⚠️" in result

# 检查输出文件是否存在
full_output_path = Path("artifacts") / output_file_name
assert full_output_path.exists()

# 检查内容是否为空或接近空
extracted_content = full_output_path.read_text(encoding='utf-8').strip()
assert len(extracted_content) < 100 # 应该只有很少的字符或空格
4 changes: 4 additions & 0 deletions tests/test_tesseract/tesseractTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from PIL import Image
import pytesseract
text=pytesseract.image_to_string(Image.open('test_image.jpg'))
print(text)
Loading