feat: try pdftotext before marker-pdf for conversion

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Ellie 2026-02-25 13:25:50 -08:00
parent 29b2a6b743
commit ff29d6109d

View file

@@ -304,16 +304,64 @@ fn pdftotext_output_is_good(text: &str) -> bool {
ratio > 0.8 ratio > 0.8
} }
/// Try extracting text from a PDF using the `pdftotext` command-line tool.
///
/// The bytes are written to `paper.pdf` inside a fresh temporary directory,
/// then `pdftotext -layout <path> -` is run and the text is read from the
/// child's captured stdout (decoded lossily as UTF-8).
///
/// Returns `Some(text)` only when `pdftotext` could be started, exited
/// successfully, and its output passes `pdftotext_output_is_good`.
/// Every other outcome is reported on stderr and yields `None`, so the
/// caller can fall back to another converter.
fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> {
    // Temp-file setup failures are logged like every other failure path in
    // this function instead of being dropped silently.
    let tmp_dir = match tempfile::tempdir() {
        Ok(dir) => dir,
        Err(e) => {
            eprintln!("pdftotext skipped: failed to create temp directory: {e}");
            return None;
        }
    };
    let pdf_path = tmp_dir.path().join("paper.pdf");
    if let Err(e) = std::fs::write(&pdf_path, pdf_bytes) {
        eprintln!("pdftotext skipped: failed to write temp PDF: {e}");
        return None;
    }

    eprintln!("Trying pdftotext…");
    let output = Command::new("pdftotext")
        .arg("-layout")
        .arg(&pdf_path)
        .arg("-")
        .output();

    match output {
        // A missing binary is an expected configuration, so it gets its own
        // message rather than a generic failure.
        Err(e) if e.kind() == io::ErrorKind::NotFound => {
            eprintln!("pdftotext not found on PATH, skipping simple extraction");
            None
        }
        Err(e) => {
            eprintln!("pdftotext failed: {e}");
            None
        }
        Ok(o) if !o.status.success() => {
            eprintln!("pdftotext exited with {}", o.status);
            // `output()` captures the child's stderr; surface it so the
            // reason for the non-zero exit is not lost.
            let child_stderr = String::from_utf8_lossy(&o.stderr);
            let child_stderr = child_stderr.trim();
            if !child_stderr.is_empty() {
                eprintln!("pdftotext stderr: {child_stderr}");
            }
            None
        }
        Ok(o) => {
            let text = String::from_utf8_lossy(&o.stdout).into_owned();
            if pdftotext_output_is_good(&text) {
                eprintln!("pdftotext output looks good, skipping marker");
                Some(text)
            } else {
                eprintln!("pdftotext output is low quality, falling back to marker");
                None
            }
        }
    }
}
/// Write PDF bytes to a temp file, run marker_single, and return the /// Write PDF bytes to a temp file, run marker_single, and return the
/// resulting markdown. /// resulting markdown.
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> { fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
// Try simple extraction first.
if let Some(text) = try_pdftotext(pdf_bytes) {
return Ok(text);
}
// Fall back to marker-pdf.
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?; let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
let pdf_path = tmp_dir.path().join("paper.pdf"); let pdf_path = tmp_dir.path().join("paper.pdf");
let out_dir = tmp_dir.path().join("output"); let out_dir = tmp_dir.path().join("output");
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?; std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
eprintln!("Converting PDF to markdown…"); eprintln!("Converting PDF to markdown with marker");
let status = Command::new("marker_single") let status = Command::new("marker_single")
.arg(&pdf_path) .arg(&pdf_path)