// Patch content for src/main.rs (7c760ef..baab9fc): the quality check below is
// inserted in the "Conversion" section ahead of `convert_to_markdown`, and the
// test module is appended after `find_markdown_file`.

// ---------------------------------------------------------------------------
// Conversion
// ---------------------------------------------------------------------------

/// Check if pdftotext output is usable: long enough and mostly readable text.
///
/// Leading and trailing whitespace is ignored, so output that is blank (for
/// example only the page-break form feeds and newlines) is never reported as
/// usable. The remaining text must be at least 500 UTF-8 bytes long, and more
/// than 80% of its characters must be alphanumeric, whitespace, or ASCII
/// punctuation.
///
/// Returns `true` when both conditions hold, `false` otherwise.
fn pdftotext_output_is_good(text: &str) -> bool {
    // Minimum size of the trimmed text, measured in UTF-8 bytes.
    const MIN_LEN_BYTES: usize = 500;
    // The readable share must be strictly greater than this.
    const MIN_READABLE_RATIO: f64 = 0.8;

    // Whitespace counts as readable below, so without trimming a blank
    // extraction would score a perfect ratio and pass the length gate.
    let body = text.trim();
    if body.len() < MIN_LEN_BYTES {
        return false;
    }

    // Single pass: tally readable characters and the total character count.
    let (readable, total) = body.chars().fold((0_usize, 0_usize), |(readable, total), c| {
        let is_readable = c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation();
        (readable + usize::from(is_readable), total + 1)
    });

    // `total` is non-zero here: `body` holds at least `MIN_LEN_BYTES` bytes.
    readable as f64 / total as f64 > MIN_READABLE_RATIO
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn good_text_passes_quality_check() {
        let text = "This is a normal academic paper abstract. ".repeat(20);
        assert!(pdftotext_output_is_good(&text));
    }

    #[test]
    fn padded_good_text_passes_quality_check() {
        let text = "This is a normal academic paper abstract. ".repeat(20);
        let padded = format!("\n\n{text}\u{c}");
        assert!(pdftotext_output_is_good(&padded));
    }

    #[test]
    fn short_text_fails_quality_check() {
        assert!(!pdftotext_output_is_good("too short"));
    }

    #[test]
    fn garbled_text_fails_quality_check() {
        let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
        assert!(!pdftotext_output_is_good(&garbled));
    }

    #[test]
    fn blank_text_fails_quality_check() {
        // Only page breaks and newlines: nothing was actually extracted.
        let blank = "\u{c}\n".repeat(400);
        assert!(!pdftotext_output_is_good(&blank));
    }
}