From ff29d6109da177b9f81921f85afffa76fc1b467a Mon Sep 17 00:00:00 2001
From: Ellie <6687206+wizzeh@users.noreply.github.com>
Date: Wed, 25 Feb 2026 13:25:50 -0800
Subject: [PATCH] feat: try pdftotext before marker-pdf for conversion

Co-Authored-By: Claude Opus 4.6
---
 src/main.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index baab9fc..bd75521 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -304,16 +304,64 @@ fn pdftotext_output_is_good(text: &str) -> bool {
     ratio > 0.8
 }
 
+/// Try extracting text from a PDF using pdftotext.
+/// Returns Some(text) if pdftotext succeeds and the output looks good,
+/// None otherwise.
+fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> {
+    let tmp_dir = tempfile::tempdir().ok()?;
+    let pdf_path = tmp_dir.path().join("paper.pdf");
+    std::fs::write(&pdf_path, pdf_bytes).ok()?;
+
+    eprintln!("Trying pdftotext…");
+
+    let output = Command::new("pdftotext")
+        .arg("-layout")
+        .arg(&pdf_path)
+        .arg("-")
+        .output();
+
+    match output {
+        Err(e) if e.kind() == io::ErrorKind::NotFound => {
+            eprintln!("pdftotext not found on PATH, skipping simple extraction");
+            None
+        }
+        Err(e) => {
+            eprintln!("pdftotext failed: {e}");
+            None
+        }
+        Ok(o) if !o.status.success() => {
+            eprintln!("pdftotext exited with {}", o.status);
+            None
+        }
+        Ok(o) => {
+            let text = String::from_utf8_lossy(&o.stdout).into_owned();
+            if pdftotext_output_is_good(&text) {
+                eprintln!("pdftotext output looks good, skipping marker");
+                Some(text)
+            } else {
+                eprintln!("pdftotext output is low quality, falling back to marker");
+                None
+            }
+        }
+    }
+}
+
 /// Write PDF bytes to a temp file, run marker_single, and return the
 /// resulting markdown.
 fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
+    // Try simple extraction first.
+    if let Some(text) = try_pdftotext(pdf_bytes) {
+        return Ok(text);
+    }
+
+    // Fall back to marker-pdf.
     let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
     let pdf_path = tmp_dir.path().join("paper.pdf");
     let out_dir = tmp_dir.path().join("output");
 
     std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
 
-    eprintln!("Converting PDF to markdown…");
+    eprintln!("Converting PDF to markdown with marker…");
 
     let status = Command::new("marker_single")
         .arg(&pdf_path)