feat: add pdftotext quality heuristic with tests
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a45b8df676
commit
29b2a6b743
1 changed files with 34 additions and 0 deletions
34
src/main.rs
34
src/main.rs
|
|
@ -292,6 +292,18 @@ fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
|||
// Conversion
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Check if pdftotext output is usable: long enough and mostly readable text.
///
/// Returns `false` when `text` is shorter than 500 bytes (`str::len` counts
/// UTF-8 bytes, not characters). Otherwise returns `true` only when strictly
/// more than 80% of the characters are alphanumeric, whitespace, or ASCII
/// punctuation.
///
/// Whitespace counts as readable, so a sufficiently long whitespace-only
/// input is accepted by this check.
fn pdftotext_output_is_good(text: &str) -> bool {
    // Minimum size, in bytes, below which the output is rejected outright.
    const MIN_LEN_BYTES: usize = 500;
    // The readable share of characters must be strictly above this value.
    const MIN_READABLE_RATIO: f64 = 0.8;

    if text.len() < MIN_LEN_BYTES {
        return false;
    }

    // Count total and readable characters in a single pass over the text.
    let (total, readable) = text.chars().fold((0usize, 0usize), |(total, readable), ch| {
        let is_readable =
            ch.is_alphanumeric() || ch.is_whitespace() || ch.is_ascii_punctuation();
        (total + 1, readable + usize::from(is_readable))
    });

    // `total` is non-zero here: the length guard guarantees at least one char.
    readable as f64 / total as f64 > MIN_READABLE_RATIO
}
|
||||
|
||||
/// Write PDF bytes to a temp file, run marker_single, and return the
|
||||
/// resulting markdown.
|
||||
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||
|
|
@ -342,3 +354,25 @@ fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
|
|||
}
|
||||
bail!("no .md file found in marker output")
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Ordinary English prose well over the length minimum is accepted.
    #[test]
    fn good_text_passes_quality_check() {
        let sample = "This is a normal academic paper abstract. ".repeat(20);
        assert!(pdftotext_output_is_good(&sample));
    }

    /// Input below the minimum length is rejected regardless of content.
    #[test]
    fn short_text_fails_quality_check() {
        let sample = "too short";
        assert!(!pdftotext_output_is_good(sample));
    }

    /// Long input consisting only of replacement and block symbols is rejected.
    #[test]
    fn garbled_text_fails_quality_check() {
        let sample = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
        assert!(!pdftotext_output_is_good(&sample));
    }
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue