/// Check if pdftotext output is usable: long enough and mostly readable text.
///
/// Returns `false` when any of the following holds:
///
/// * `text` is shorter than 500 bytes (UTF-8 length, not character count);
/// * `text` contains no visible (non-whitespace) characters at all;
/// * 80% or fewer of the visible characters are alphanumeric or ASCII
///   punctuation.
///
/// Whitespace is deliberately excluded from the ratio. Counting it as
/// "readable" lets blank output (spaces, newlines, form feeds) score a perfect
/// ratio, and lets whitespace padding dilute garbage characters until they
/// slip under the threshold.
fn pdftotext_output_is_good(text: &str) -> bool {
    const MIN_LEN_BYTES: usize = 500;
    const MIN_READABLE_RATIO: f64 = 0.8;

    if text.len() < MIN_LEN_BYTES {
        return false;
    }

    // Single pass over the visible characters: count how many there are and
    // how many of them look like ordinary text.
    let (visible, readable) = text
        .chars()
        .filter(|c| !c.is_whitespace())
        .fold((0usize, 0usize), |(visible, readable), c| {
            let looks_like_text = c.is_alphanumeric() || c.is_ascii_punctuation();
            (visible + 1, readable + usize::from(looks_like_text))
        });

    // Whitespace-only output carries no content; bail out before dividing.
    if visible == 0 {
        return false;
    }

    readable as f64 / visible as f64 > MIN_READABLE_RATIO
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn good_text_passes_quality_check() {
        let text = "This is a normal academic paper abstract. ".repeat(20);
        assert!(pdftotext_output_is_good(&text));
    }

    #[test]
    fn short_text_fails_quality_check() {
        assert!(!pdftotext_output_is_good("too short"));
    }

    #[test]
    fn garbled_text_fails_quality_check() {
        let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
        assert!(!pdftotext_output_is_good(&garbled));
    }

    #[test]
    fn blank_text_fails_quality_check() {
        // Long enough in bytes, but nothing except whitespace.
        let blank = " \n\u{c}".repeat(300);
        assert!(!pdftotext_output_is_good(&blank));
    }

    #[test]
    fn whitespace_padded_garbage_fails_quality_check() {
        // Padding must not be able to hide unreadable characters.
        let padded = format!("{}{}", "\u{fffd}".repeat(100), " ".repeat(900));
        assert!(!pdftotext_output_is_good(&padded));
    }
}