diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e33cdc..fcd19d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Possible sections are: ### Fixed: - avoid Z.AI OCR rate-limit failures in large batch runs by introducing OCR-specific concurrency control (`--ocr-workers`) and clearer HTTP 429 guidance ([#7](https://github.com/atsyplenkov/paperdown/issues/7)) +- align skip and output-reuse behavior with marker-based semantics: skip only when `<output>/<stem>/log.jsonl` exists; otherwise refresh managed artifacts and continue processing ([#11](https://github.com/atsyplenkov/paperdown/issues/11)) ## [0.2.0] - 2026-03-18 diff --git a/README.md b/README.md index 7201307..8d28c5f 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,8 @@ paperdown --input pdf/ --output md/ --workers 32 --ocr-workers 2 --overwrite `--workers` controls how many PDFs are processed concurrently in batch mode. `--ocr-workers` controls concurrent OCR API calls. Effective OCR concurrency is `min(--workers, --ocr-workers)`. +Without `--overwrite`, an existing `<output>/<stem>/log.jsonl` marker skips the PDF. If the log marker is missing, `paperdown` treats the PDF as unprocessed and refreshes managed artifacts (`index.md`, `figures/`, and `tables/` when `--normalize-tables` is enabled). With `--overwrite`, `paperdown` replaces the whole `<output>/<stem>/` folder before processing. + ## Installation Install from crates.io: @@ -91,7 +93,7 @@ Options: --workers Maximum number of PDFs processed concurrently in batch mode. [default: 32] --ocr-workers Maximum number of concurrent OCR API calls in batch mode; effective OCR concurrency is min(--workers, --ocr-workers). [default: 2] -v, --verbose Enable verbose progress messages on stderr. - --overwrite Replace existing managed output artifacts (index.md, figures/, and tables/ when enabled). + --overwrite Replace the whole <output>/<stem>/ folder before processing. --normalize-tables Normalize OCR HTML tables into Markdown and store raw HTML under tables/. 
-h, --help Print help (see a summary with '-h') -V, --version Print version diff --git a/src/cli.rs b/src/cli.rs index 0340e00..d39a9d1 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -21,8 +21,9 @@ paperdown --input pdf/ --output md/ --workers 4\n \ paperdown --input pdf/ --output md/ --overwrite\n \ paperdown --input pdf/ --output md/ --normalize-tables\n\n\ Notes:\n \ -Without --overwrite, existing index.md or figures/ causes a failure.\n \ -When --normalize-tables is enabled, existing tables/ also causes a failure.\n \ +Without --overwrite, an existing //log.jsonl marker skips the PDF.\n \ +If the log marker is missing, paperdown treats the PDF as unprocessed and refreshes managed artifacts (index.md, figures/, and tables/ when enabled).\n \ +With --overwrite, the whole // folder is replaced.\n \ Progress bars are shown on stderr only when running in a TTY." )] pub struct Cli { @@ -91,7 +92,7 @@ pub struct Cli { #[arg( long, action = ArgAction::SetTrue, - help = "Replace existing managed output artifacts (index.md and figures/)." + help = "Replace the whole // folder before processing." )] pub overwrite: bool, @@ -168,6 +169,15 @@ mod tests { assert!(help.contains("Examples:")); assert!(help.contains("--overwrite")); assert!(help.contains("--normalize-tables")); + assert!(help.contains( + "Without --overwrite, an existing //log.jsonl marker skips the PDF." + )); + assert!(help.contains( + "If the log marker is missing, paperdown treats the PDF as unprocessed and refreshes managed artifacts (index.md, figures/, and tables/ when enabled)." 
+ )); + assert!( + help.contains("With --overwrite, the whole // folder is replaced.") + ); let file_first = help.find("1) ZAI_API_KEY from --env-file"); let env_second = help.find("2) ZAI_API_KEY from environment"); assert!(file_first.is_some()); diff --git a/src/core/output.rs b/src/core/output.rs index 902eb4a..afa2fc9 100644 --- a/src/core/output.rs +++ b/src/core/output.rs @@ -14,6 +14,28 @@ pub(crate) struct PreparedOutput { pub(crate) log_path: PathBuf, } +fn remove_path_if_exists(path: &Path) -> Result<()> { + match std::fs::symlink_metadata(path) { + Ok(metadata) => { + if metadata.is_dir() { + std::fs::remove_dir_all(path)?; + } else { + std::fs::remove_file(path)?; + } + } + Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} + Err(err) => return Err(err.into()), + } + Ok(()) +} + +fn validate_output_stem(stem: &str) -> Result<()> { + if stem.is_empty() || stem == "." || stem == ".." || stem.contains('/') || stem.contains('\\') { + return Err(anyhow::anyhow!("Invalid output stem: {stem}")); + } + Ok(()) +} + pub(crate) fn prepare_output_paths( output_root: &Path, pdf_path: &Path, @@ -24,8 +46,12 @@ pub(crate) fn prepare_output_paths( .file_stem() .and_then(|s| s.to_str()) .ok_or_else(|| anyhow::anyhow!("Invalid PDF filename: {}", pdf_path.display()))?; + validate_output_stem(stem)?; let output_dir = output_root.join(stem); + if overwrite { + remove_path_if_exists(&output_dir)?; + } std::fs::create_dir_all(&output_dir)?; let markdown_path = output_dir.join("index.md"); @@ -34,41 +60,16 @@ pub(crate) fn prepare_output_paths( let log_path = output_dir.join("log.jsonl"); if !overwrite { - if markdown_path.exists() { + if log_path.is_file() { return Err(anyhow::anyhow!( "Output already exists: {}. Re-run with --overwrite", - markdown_path.display() + log_path.display() )); } - if figures_dir.exists() { - return Err(anyhow::anyhow!( - "Output already exists: {}. 
Re-run with --overwrite", - figures_dir.display() - )); - } - if normalize_tables && tables_dir.exists() { - return Err(anyhow::anyhow!( - "Output already exists: {}. Re-run with --overwrite", - tables_dir.display() - )); - } - } else { - if markdown_path.exists() { - std::fs::remove_file(&markdown_path)?; - } - if figures_dir.exists() { - if figures_dir.is_dir() { - std::fs::remove_dir_all(&figures_dir)?; - } else { - std::fs::remove_file(&figures_dir)?; - } - } - if normalize_tables && tables_dir.exists() { - if tables_dir.is_dir() { - std::fs::remove_dir_all(&tables_dir)?; - } else { - std::fs::remove_file(&tables_dir)?; - } + remove_path_if_exists(&markdown_path)?; + remove_path_if_exists(&figures_dir)?; + if normalize_tables { + remove_path_if_exists(&tables_dir)?; } } @@ -125,3 +126,25 @@ pub(crate) async fn atomic_write_bytes(path: &Path, content: &[u8]) -> Result<() fs::rename(&temp_path, path).await?; Ok(()) } + +#[cfg(test)] +mod tests { + use super::validate_output_stem; + + #[test] + fn validate_output_stem_rejects_backslash() { + let err = validate_output_stem("a\\b").unwrap_err().to_string(); + assert!(err.contains("Invalid output stem")); + } + + #[test] + fn validate_output_stem_rejects_forward_slash() { + let err = validate_output_stem("a/b").unwrap_err().to_string(); + assert!(err.contains("Invalid output stem")); + } + + #[test] + fn validate_output_stem_accepts_normal_stem() { + assert!(validate_output_stem("paper").is_ok()); + } +} diff --git a/src/main.rs b/src/main.rs index a3ca449..ba302b8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -37,6 +37,10 @@ async fn run() -> Result { }; if pdfs.len() == 1 { + if !args.overwrite && has_existing_log_marker(&args.output, &pdfs[0]) { + print_single_skip_summary_stdout(&pdfs[0]); + return Ok(0); + } if args.verbose { eprintln!("Processing 1 PDF: {}", pdfs[0].display()); } @@ -57,18 +61,40 @@ async fn run() -> Result { return Ok(0); } - let workers = args.workers.min(pdfs.len()).max(1); + let 
total_inputs = pdfs.len(); + let mut skipped_count = 0usize; + let mut process_pdfs = Vec::new(); + for pdf in pdfs { + if !args.overwrite && has_existing_log_marker(&args.output, &pdf) { + skipped_count += 1; + } else { + process_pdfs.push(pdf); + } + } + + if process_pdfs.is_empty() { + let counts = batch_accounting(total_inputs, 0, skipped_count, 0, 0); + print_batch_summary_stdout( + counts.processed, + counts.skipped, + counts.failed, + counts.figures, + ); + return Ok(0); + } + + let workers = args.workers.min(process_pdfs.len()).max(1); let ocr_workers = effective_ocr_workers(workers, args.ocr_workers); eprintln!( "Processing {} PDFs with {} workers (OCR concurrency: {})...", - pdfs.len(), + process_pdfs.len(), workers, ocr_workers ); let semaphore = Arc::new(Semaphore::new(workers)); let ocr_semaphore = Arc::new(Semaphore::new(ocr_workers)); - let results = stream::iter(pdfs.into_iter().map(|pdf| { + let results = stream::iter(process_pdfs.into_iter().map(|pdf| { let permit_pool = semaphore.clone(); let ocr_limiter = ocr_semaphore.clone(); let output = args.output.clone(); @@ -118,8 +144,20 @@ async fn run() -> Result { } } - print_batch_summary_stdout(success_count, failed_count, downloaded_figures); - Ok(if failed_count > 0 { 1 } else { 0 }) + let counts = batch_accounting( + total_inputs, + success_count, + skipped_count, + failed_count, + downloaded_figures, + ); + print_batch_summary_stdout( + counts.processed, + counts.skipped, + counts.failed, + counts.figures, + ); + Ok(if counts.failed > 0 { 1 } else { 0 }) } fn stderr_is_tty() -> bool { @@ -141,6 +179,21 @@ fn stdout_is_tty() -> bool { std::io::stdout().is_terminal() } +fn has_existing_log_marker(output_root: &Path, pdf: &Path) -> bool { + let Some(stem) = pdf.file_stem() else { + return false; + }; + output_root.join(stem).join("log.jsonl").is_file() +} + +fn print_single_skip_summary_stdout(pdf: &Path) { + if stdout_is_tty() { + println!("\x1b[1;33mSkipped\x1b[0m {}", display_path(pdf)); + } 
else { + println!("Skipped {}", display_path(pdf)); + } +} + fn print_single_summary_stdout(summary: &PdfSummary) { if stdout_is_tty() { println!( @@ -165,7 +218,35 @@ fn print_single_summary_stdout(summary: &PdfSummary) { } } -fn print_batch_summary_stdout(processed: usize, failed: usize, figures: usize) { +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct BatchAccounting { + processed: usize, + skipped: usize, + failed: usize, + figures: usize, +} + +fn batch_accounting( + total_inputs: usize, + processed: usize, + skipped: usize, + failed: usize, + figures: usize, +) -> BatchAccounting { + assert_eq!( + processed + skipped + failed, + total_inputs, + "batch accounting invariant violated" + ); + BatchAccounting { + processed, + skipped, + failed, + figures, + } +} + +fn print_batch_summary_stdout(processed: usize, skipped: usize, failed: usize, figures: usize) { if stdout_is_tty() { let color = if failed == 0 { "\x1b[1;32m" @@ -173,10 +254,12 @@ fn print_batch_summary_stdout(processed: usize, failed: usize, figures: usize) { "\x1b[1;33m" }; println!( - "{color}Batch Complete\x1b[0m processed: \x1b[1m{processed}\x1b[0m failed: \x1b[1m{failed}\x1b[0m figures: \x1b[1m{figures}\x1b[0m" + "{color}Batch Complete\x1b[0m processed: \x1b[1m{processed}\x1b[0m skipped: \x1b[1m{skipped}\x1b[0m failed: \x1b[1m{failed}\x1b[0m figures: \x1b[1m{figures}\x1b[0m" ); } else { - println!("Batch Complete processed: {processed} failed: {failed} figures: {figures}"); + println!( + "Batch Complete processed: {processed} skipped: {skipped} failed: {failed} figures: {figures}" + ); } } @@ -298,7 +381,8 @@ mod tests { log_path: "/tmp/out/paper/log.jsonl".to_string(), }; print_single_summary_stdout(&summary); - print_batch_summary_stdout(2, 1, 4); + print_single_skip_summary_stdout(Path::new(&summary.pdf)); + print_batch_summary_stdout(2, 1, 1, 4); } #[test] @@ -325,4 +409,49 @@ mod tests { assert_eq!(effective_ocr_workers(8, 32), 8); assert_eq!(effective_ocr_workers(1, 2), 1); } + + 
#[test] + fn has_existing_log_marker_returns_true_when_log_file_exists() { + let temp = tempfile::tempdir().expect("tempdir"); + let output_root = temp.path(); + let pdf = temp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF-1.4").expect("create pdf"); + let log_path = output_root.join("paper").join("log.jsonl"); + std::fs::create_dir_all(log_path.parent().expect("log parent")).expect("create log dir"); + std::fs::write(&log_path, b"{}\n").expect("write log marker"); + + assert!(has_existing_log_marker(output_root, &pdf)); + } + + #[test] + fn has_existing_log_marker_returns_false_when_log_file_missing() { + let temp = tempfile::tempdir().expect("tempdir"); + let output_root = temp.path(); + let pdf = temp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF-1.4").expect("create pdf"); + + assert!(!has_existing_log_marker(output_root, &pdf)); + } + + mod main { + use super::*; + + mod tests { + use super::*; + + #[test] + fn batch_accounting_mixed_outcomes_is_consistent() { + let counts = batch_accounting(5, 2, 1, 2, 7); + assert_eq!( + counts, + BatchAccounting { + processed: 2, + skipped: 1, + failed: 2, + figures: 7 + } + ); + } + } + } } diff --git a/tests/cli_coverage.rs b/tests/cli_coverage.rs index 16ec953..216d444 100644 --- a/tests/cli_coverage.rs +++ b/tests/cli_coverage.rs @@ -66,7 +66,43 @@ fn cli_batch_reports_failed_count() { assert!(!output.status.success()); let stdout = String::from_utf8_lossy(&output.stdout); let stderr = String::from_utf8_lossy(&output.stderr); - assert!(stdout.contains("Batch Complete processed: 0 failed: 2 figures: 0")); + assert!(stdout.contains("Batch Complete processed: 0 skipped: 0 failed: 2 figures: 0")); assert!(stderr.contains("failed:")); assert!(stderr.contains("OCR concurrency: 1")); } + +#[test] +fn cli_single_pdf_skips_when_log_exists_and_env_missing() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF-1.7\n").unwrap(); + + let output_dir = 
tmp.path().join("output"); + let paper_dir = output_dir.join("paper"); + std::fs::create_dir_all(&paper_dir).unwrap(); + std::fs::write(paper_dir.join("log.jsonl"), b"{}\n").unwrap(); + + let env_file = tmp.path().join("missing.env"); + + let mut cmd = Command::cargo_bin("paperdown").unwrap(); + let output = cmd + .current_dir(tmp.path()) + .args([ + "--input", + pdf.to_str().unwrap(), + "--output", + output_dir.to_str().unwrap(), + "--env-file", + env_file.to_str().unwrap(), + ]) + .env_remove("ZAI_API_KEY") + .output() + .unwrap(); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stdout.contains("Skipped")); + assert!(stdout.contains("paper.pdf")); + assert!(!stderr.contains("ZAI_API_KEY")); +} diff --git a/tests/cli_existing_output.rs b/tests/cli_existing_output.rs index e6e11f0..f9577f5 100644 --- a/tests/cli_existing_output.rs +++ b/tests/cli_existing_output.rs @@ -2,7 +2,7 @@ use assert_cmd::Command; use std::fs; #[test] -fn batch_existing_outputs_fail_before_env_or_ocr() { +fn batch_without_log_marker_reaches_env_lookup_even_with_stale_outputs() { let temp = tempfile::tempdir().expect("tempdir"); let pdf_dir = temp.path().join("pdf"); let out_dir = temp.path().join("md"); @@ -17,6 +17,10 @@ fn batch_existing_outputs_fail_before_env_or_ocr() { fs::create_dir_all(out_dir.join("b")).expect("out b"); fs::write(out_dir.join("a/index.md"), b"old").expect("index a"); fs::write(out_dir.join("b/index.md"), b"old").expect("index b"); + fs::create_dir_all(out_dir.join("a/figures")).expect("figures a"); + fs::create_dir_all(out_dir.join("b/figures")).expect("figures b"); + fs::write(out_dir.join("a/figures/stale.png"), b"old").expect("stale fig a"); + fs::write(out_dir.join("b/figures/stale.png"), b"old").expect("stale fig b"); let missing_env = temp.path().join("missing.env"); @@ -41,15 +45,62 @@ fn batch_existing_outputs_fail_before_env_or_ocr() { let stdout 
= String::from_utf8_lossy(&output.stdout); let stderr = String::from_utf8_lossy(&output.stderr); - assert!(stdout.contains("Batch Complete processed: 0 failed: 2 figures: 0")); + assert!(stdout.contains("Batch Complete processed: 0 skipped: 0 failed: 2 figures: 0")); assert!(stderr.contains("failed:")); assert!(stderr.contains("a.pdf")); assert!(stderr.contains("b.pdf")); - assert!(stderr.contains("Re-run with --overwrite")); assert!(stderr.contains("OCR concurrency:")); - assert!(!stderr.contains("ZAI_API_KEY")); + assert!(stderr.contains("ZAI_API_KEY")); + assert!(!stderr.contains("Re-run with --overwrite")); assert!(!stdout.contains("\u{1b}[")); assert!(!stderr.contains("\u{1b}[")); } + +#[test] +fn batch_existing_log_outputs_skip_without_env_or_ocr() { + let temp = tempfile::tempdir().expect("tempdir"); + let pdf_dir = temp.path().join("pdf"); + let out_dir = temp.path().join("output"); + fs::create_dir_all(&pdf_dir).expect("pdf dir"); + fs::create_dir_all(&out_dir).expect("output dir"); + + let pdf_a = pdf_dir.join("a.pdf"); + let pdf_b = pdf_dir.join("b.pdf"); + fs::write(&pdf_a, b"%PDF").expect("pdf a"); + fs::write(&pdf_b, b"%PDF").expect("pdf b"); + + fs::create_dir_all(out_dir.join("a")).expect("out a"); + fs::create_dir_all(out_dir.join("b")).expect("out b"); + fs::write(out_dir.join("a/log.jsonl"), b"{}\n").expect("log a"); + fs::write(out_dir.join("b/log.jsonl"), b"{}\n").expect("log b"); + + let missing_env = temp.path().join("missing.env"); + + let output = Command::cargo_bin("paperdown") + .expect("binary") + .args([ + "--input", + pdf_dir.to_str().expect("pdf path"), + "--output", + out_dir.to_str().expect("out path"), + "--workers", + "2", + "--env-file", + missing_env.to_str().expect("env path"), + ]) + .env_remove("ZAI_API_KEY") + .output() + .expect("run"); + + assert!(output.status.success()); + + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + + assert!(stdout.contains("Batch 
Complete processed: 0 skipped: 2 failed: 0 figures: 0")); + assert!(!stderr.contains("ZAI_API_KEY")); + assert!(!stderr.contains("OCR concurrency:")); + assert!(!stderr.contains("failed:")); +} diff --git a/tests/core_internal.rs b/tests/core_internal.rs index 0915f19..08424be 100644 --- a/tests/core_internal.rs +++ b/tests/core_internal.rs @@ -764,7 +764,7 @@ fn strip_html_img_alt_attributes_keeps_localized_image_urls() { } #[test] -fn prepare_output_without_overwrite_fails_on_existing_managed_artifacts() { +fn prepare_output_without_overwrite_replaces_existing_index_when_log_missing() { let tmp = TempDir::new().unwrap(); let pdf = tmp.path().join("paper.pdf"); std::fs::write(&pdf, b"%PDF").unwrap(); @@ -772,45 +772,63 @@ fn prepare_output_without_overwrite_fails_on_existing_managed_artifacts() { std::fs::create_dir_all(&target).unwrap(); std::fs::write(target.join("index.md"), b"old").unwrap(); - let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) - .unwrap_err() - .to_string(); - assert!(err.contains("--overwrite")); + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false).unwrap(); + assert_eq!(prepared.markdown_path, target.join("index.md")); + assert!(!prepared.markdown_path.exists()); + assert!(prepared.figures_dir.is_dir()); } #[test] -fn prepare_output_without_overwrite_fails_when_only_figures_exists() { +fn prepare_output_without_overwrite_cleans_stale_figures_when_log_missing() { let tmp = TempDir::new().unwrap(); let pdf = tmp.path().join("paper.pdf"); std::fs::write(&pdf, b"%PDF").unwrap(); let target = tmp.path().join("out").join("paper"); std::fs::create_dir_all(target.join("figures")).unwrap(); + std::fs::write(target.join("figures").join("stale.png"), b"old").unwrap(); - let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) - .unwrap_err() - .to_string(); - assert!(err.contains("figures")); - assert!(err.contains("--overwrite")); + let prepared = 
prepare_output_paths(&tmp.path().join("out"), &pdf, false, false).unwrap(); + assert!(prepared.figures_dir.is_dir()); + assert!(!prepared.figures_dir.join("stale.png").exists()); +} + +#[test] +fn prepare_output_without_overwrite_cleans_index_and_figures_when_log_missing() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + let target = tmp.path().join("out").join("paper"); + std::fs::create_dir_all(target.join("figures")).unwrap(); + std::fs::write(target.join("figures").join("stale.png"), b"old").unwrap(); + std::fs::write(target.join("index.md"), b"old").unwrap(); + + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false).unwrap(); + assert!(!prepared.markdown_path.exists()); + assert!(!prepared.figures_dir.join("stale.png").exists()); } #[test] -fn prepare_output_without_overwrite_fails_when_both_exist() { +fn prepare_output_without_overwrite_preserves_completed_output_when_log_exists() { let tmp = TempDir::new().unwrap(); let pdf = tmp.path().join("paper.pdf"); std::fs::write(&pdf, b"%PDF").unwrap(); let target = tmp.path().join("out").join("paper"); std::fs::create_dir_all(target.join("figures")).unwrap(); + std::fs::write(target.join("figures").join("stale.png"), b"old").unwrap(); std::fs::write(target.join("index.md"), b"old").unwrap(); + std::fs::write(target.join("log.jsonl"), b"{}\n").unwrap(); let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) .unwrap_err() .to_string(); - assert!(err.contains("index.md")); + assert!(err.contains("log.jsonl")); assert!(err.contains("--overwrite")); + assert!(target.join("index.md").exists()); + assert!(target.join("figures").join("stale.png").exists()); } #[test] -fn prepare_output_with_overwrite_preserves_unrelated_files() { +fn prepare_output_with_overwrite_removes_unrelated_files() { let tmp = TempDir::new().unwrap(); let pdf = tmp.path().join("paper.pdf"); std::fs::write(&pdf, 
b"%PDF").unwrap(); @@ -825,7 +843,22 @@ fn prepare_output_with_overwrite_preserves_unrelated_files() { let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, false).unwrap(); assert!(prepared.figures_dir.exists()); assert!(!prepared.figures_dir.join("stale.png").exists()); - assert!(out.join("keep.txt").exists()); + assert!(!out.join("keep.txt").exists()); +} + +#[test] +fn prepare_output_with_overwrite_replaces_output_file_path() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + + let out_file = tmp.path().join("out").join("paper"); + std::fs::create_dir_all(tmp.path().join("out")).unwrap(); + std::fs::write(&out_file, b"stale").unwrap(); + + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, false).unwrap(); + assert!(prepared.output_dir.is_dir()); + assert!(prepared.figures_dir.is_dir()); } #[test] @@ -862,6 +895,27 @@ fn prepare_output_with_normalize_tables_manages_tables_dir() { ); } +#[test] +fn prepare_output_without_overwrite_cleans_tables_when_enabled_and_log_missing() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + let out = tmp.path().join("out").join("paper"); + std::fs::create_dir_all(out.join("tables")).unwrap(); + std::fs::write(out.join("tables").join("stale.html"), b"old").unwrap(); + + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, false, true).unwrap(); + assert!(prepared.tables_dir.as_ref().unwrap().is_dir()); + assert!( + !prepared + .tables_dir + .as_ref() + .unwrap() + .join("stale.html") + .exists() + ); +} + #[test] fn prepare_output_without_overwrite_ignores_stale_tables_when_disabled() { let tmp = TempDir::new().unwrap(); @@ -874,6 +928,27 @@ fn prepare_output_without_overwrite_ignores_stale_tables_when_disabled() { assert!(prepared.tables_dir.is_none()); } +#[test] +fn prepare_output_rejects_unsafe_stems() { + let tmp = 
TempDir::new().unwrap(); + let output_root = tmp.path().join("out"); + std::fs::create_dir_all(&output_root).unwrap(); + + let dot_stem_pdf = tmp.path().join("..pdf"); + std::fs::write(&dot_stem_pdf, b"%PDF").unwrap(); + let err = prepare_output_paths(&output_root, &dot_stem_pdf, false, false) + .unwrap_err() + .to_string(); + assert!(err.contains("Invalid output stem")); + + let dotdot_stem_pdf = tmp.path().join("...pdf"); + std::fs::write(&dotdot_stem_pdf, b"%PDF").unwrap(); + let err = prepare_output_paths(&output_root, &dotdot_stem_pdf, false, false) + .unwrap_err() + .to_string(); + assert!(err.contains("Invalid output stem")); +} + #[test] fn extract_image_url_checks_fallback_keys() { let block = json!({ @@ -925,7 +1000,7 @@ fn load_api_key_parses_quoted_value() { } #[test] -fn process_pdf_checks_output_conflict_before_env_lookup() { +fn process_pdf_checks_log_conflict_before_env_lookup() { let _guard = env_lock().lock().unwrap(); unsafe { std::env::remove_var("ZAI_API_KEY"); @@ -938,7 +1013,7 @@ fn process_pdf_checks_output_conflict_before_env_lookup() { let output_root = tmp.path().join("out"); let output_dir = output_root.join("paper"); std::fs::create_dir_all(&output_dir).unwrap(); - std::fs::write(output_dir.join("index.md"), b"existing").unwrap(); + std::fs::write(output_dir.join("log.jsonl"), b"{}\n").unwrap(); let missing_env = tmp.path().join("missing.env"); let rt = tokio::runtime::Runtime::new().unwrap(); @@ -961,3 +1036,42 @@ fn process_pdf_checks_output_conflict_before_env_lookup() { assert!(err.contains("Re-run with --overwrite")); assert!(!err.contains("ZAI_API_KEY")); } + +#[test] +fn process_pdf_reaches_env_lookup_when_log_missing_despite_stale_outputs() { + let _guard = env_lock().lock().unwrap(); + unsafe { + std::env::remove_var("ZAI_API_KEY"); + } + + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + + let output_root = tmp.path().join("out"); + let output_dir = 
output_root.join("paper"); + std::fs::create_dir_all(output_dir.join("figures")).unwrap(); + std::fs::write(output_dir.join("index.md"), b"existing").unwrap(); + std::fs::write(output_dir.join("figures").join("stale.png"), b"old").unwrap(); + + let missing_env = tmp.path().join("missing.env"); + let rt = tokio::runtime::Runtime::new().unwrap(); + let err = rt + .block_on(process_pdf( + &pdf, + &output_root, + &missing_env, + ProcessPdfOptions { + timeout: Duration::from_secs(1), + max_download_bytes: 1024, + overwrite: false, + normalize_tables: false, + progress: None, + }, + )) + .unwrap_err() + .to_string(); + + assert!(err.contains("ZAI_API_KEY")); + assert!(!err.contains("Re-run with --overwrite")); +}