feat: add pdftotext quality heuristic with tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Ellie 2026-02-25 13:21:51 -08:00
parent a45b8df676
commit 29b2a6b743

View file

@ -292,6 +292,18 @@ fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
// Conversion // Conversion
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
/// Check if pdftotext output is usable: long enough and mostly readable text.
///
/// Leading and trailing whitespace is ignored before any measurement, so
/// output that consists only of whitespace (spaces, newlines, form feeds)
/// is rejected no matter how long it is.
///
/// Returns `true` only when both conditions hold for the trimmed text:
/// * it is at least 500 bytes long (UTF-8 bytes, not characters), and
/// * more than 80% of its characters are alphanumeric, whitespace, or
///   ASCII punctuation.
fn pdftotext_output_is_good(text: &str) -> bool {
    /// Minimum length, in UTF-8 bytes, of the trimmed text.
    const MIN_LEN_BYTES: usize = 500;
    /// The share of readable characters must be strictly greater than this.
    const MIN_READABLE_RATIO: f64 = 0.8;

    // Whitespace counts as "readable" below, so without trimming a blank
    // extraction would score a perfect ratio and be accepted.
    let body = text.trim();
    if body.len() < MIN_LEN_BYTES {
        return false;
    }

    // Single pass: count every character and the readable ones together.
    let mut total = 0usize;
    let mut readable = 0usize;
    for c in body.chars() {
        total += 1;
        if c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation() {
            readable += 1;
        }
    }

    // `total` cannot be zero here: the length gate guarantees at least
    // MIN_LEN_BYTES bytes, and a char occupies at most four of them.
    readable as f64 / total as f64 > MIN_READABLE_RATIO
}
/// Write PDF bytes to a temp file, run marker_single, and return the /// Write PDF bytes to a temp file, run marker_single, and return the
/// resulting markdown. /// resulting markdown.
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> { fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
@ -342,3 +354,25 @@ fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
} }
bail!("no .md file found in marker output") bail!("no .md file found in marker output")
} }
#[cfg(test)]
mod tests {
    use super::*;

    /// Ordinary English prose, repeated until it is well over the length
    /// minimum, is accepted.
    #[test]
    fn good_text_passes_quality_check() {
        let sample = "This is a normal academic paper abstract. ".repeat(20);
        let accepted = pdftotext_output_is_good(sample.as_str());
        assert!(accepted);
    }

    /// A string of only a few bytes is rejected.
    #[test]
    fn short_text_fails_quality_check() {
        let accepted = pdftotext_output_is_good("too short");
        assert!(!accepted);
    }

    /// A long run of non-text symbols (replacement character and block
    /// glyphs) is rejected even though it is long enough.
    #[test]
    fn garbled_text_fails_quality_check() {
        let noise = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
        let accepted = pdftotext_output_is_good(noise.as_str());
        assert!(!accepted);
    }
}