feat: add pdftotext quality heuristic with tests
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a45b8df676
commit
29b2a6b743
1 changed files with 34 additions and 0 deletions
34
src/main.rs
34
src/main.rs
|
|
@ -292,6 +292,18 @@ fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
||||||
// Conversion
|
// Conversion
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Decide whether `pdftotext` output is usable: long enough and mostly
/// readable text.
///
/// Returns `true` only when both conditions hold:
///
/// * after trimming leading and trailing whitespace, `text` is at least
///   `MIN_LEN_BYTES` bytes long (UTF-8 bytes, not characters), and
/// * strictly more than `MIN_READABLE_RATIO` of the characters of `text`
///   are alphanumeric, whitespace, or ASCII punctuation.
///
/// The length is measured on the trimmed text because whitespace counts as
/// readable in the ratio: without trimming, output that is nothing but
/// whitespace (for example only line or page breaks), or a short snippet
/// padded with blanks, would pass both checks.
fn pdftotext_output_is_good(text: &str) -> bool {
    const MIN_LEN_BYTES: usize = 500;
    const MIN_READABLE_RATIO: f64 = 0.8;

    if text.trim().len() < MIN_LEN_BYTES {
        return false;
    }

    // Single pass over the characters: count the readable ones and the total
    // together instead of walking the string twice.
    let (readable, total) = text
        .chars()
        .fold((0usize, 0usize), |(readable, total), c| {
            let is_readable =
                c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation();
            (readable + usize::from(is_readable), total + 1)
        });

    // `total` is non-zero here: the trimmed text is at least MIN_LEN_BYTES long.
    readable as f64 / total as f64 > MIN_READABLE_RATIO
}
|
||||||
|
|
||||||
/// Write PDF bytes to a temp file, run marker_single, and return the
|
/// Write PDF bytes to a temp file, run marker_single, and return the
|
||||||
/// resulting markdown.
|
/// resulting markdown.
|
||||||
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||||
|
|
@ -342,3 +354,25 @@ fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
|
||||||
}
|
}
|
||||||
bail!("no .md file found in marker output")
|
bail!("no .md file found in marker output")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain English prose (a 42-byte sentence repeated 20 times) is accepted.
    #[test]
    fn good_text_passes_quality_check() {
        let sentence = "This is a normal academic paper abstract. ";
        let prose = sentence.repeat(20);
        let accepted = pdftotext_output_is_good(&prose);
        assert!(accepted);
    }

    /// A snippet of only a few bytes is rejected.
    #[test]
    fn short_text_fails_quality_check() {
        let snippet = "too short";
        let accepted = pdftotext_output_is_good(snippet);
        assert!(!accepted);
    }

    /// Long input made solely of replacement and block-drawing characters is
    /// rejected even though it is long.
    #[test]
    fn garbled_text_fails_quality_check() {
        let noise = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
        let accepted = pdftotext_output_is_good(&noise);
        assert!(!accepted);
    }
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue