feat: try pdftotext before marker-pdf for conversion
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
29b2a6b743
commit
ff29d6109d
1 changed file with 49 additions and 1 deletion
50
src/main.rs
50
src/main.rs
|
|
@ -304,16 +304,64 @@ fn pdftotext_output_is_good(text: &str) -> bool {
|
||||||
ratio > 0.8
|
ratio > 0.8
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Try extracting text from a PDF using pdftotext.
|
||||||
|
/// Returns Some(text) if pdftotext succeeds and the output looks good,
|
||||||
|
/// None otherwise.
|
||||||
|
fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> {
|
||||||
|
let tmp_dir = tempfile::tempdir().ok()?;
|
||||||
|
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||||
|
std::fs::write(&pdf_path, pdf_bytes).ok()?;
|
||||||
|
|
||||||
|
eprintln!("Trying pdftotext…");
|
||||||
|
|
||||||
|
let output = Command::new("pdftotext")
|
||||||
|
.arg("-layout")
|
||||||
|
.arg(&pdf_path)
|
||||||
|
.arg("-")
|
||||||
|
.output();
|
||||||
|
|
||||||
|
match output {
|
||||||
|
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
||||||
|
eprintln!("pdftotext not found on PATH, skipping simple extraction");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("pdftotext failed: {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Ok(o) if !o.status.success() => {
|
||||||
|
eprintln!("pdftotext exited with {}", o.status);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Ok(o) => {
|
||||||
|
let text = String::from_utf8_lossy(&o.stdout).into_owned();
|
||||||
|
if pdftotext_output_is_good(&text) {
|
||||||
|
eprintln!("pdftotext output looks good, skipping marker");
|
||||||
|
Some(text)
|
||||||
|
} else {
|
||||||
|
eprintln!("pdftotext output is low quality, falling back to marker");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Write PDF bytes to a temp file, run marker_single, and return the
|
/// Write PDF bytes to a temp file, run marker_single, and return the
|
||||||
/// resulting markdown.
|
/// resulting markdown.
|
||||||
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||||
|
// Try simple extraction first.
|
||||||
|
if let Some(text) = try_pdftotext(pdf_bytes) {
|
||||||
|
return Ok(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to marker-pdf.
|
||||||
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
||||||
let pdf_path = tmp_dir.path().join("paper.pdf");
|
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||||
let out_dir = tmp_dir.path().join("output");
|
let out_dir = tmp_dir.path().join("output");
|
||||||
|
|
||||||
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
||||||
|
|
||||||
eprintln!("Converting PDF to markdown…");
|
eprintln!("Converting PDF to markdown with marker…");
|
||||||
|
|
||||||
let status = Command::new("marker_single")
|
let status = Command::new("marker_single")
|
||||||
.arg(&pdf_path)
|
.arg(&pdf_path)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue