Compare commits
5 commits
a45b8df676
...
91920d4103
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
91920d4103 | ||
|
|
c3b63ea2f5 | ||
|
|
21a61b1c75 | ||
|
|
ff29d6109d | ||
|
|
29b2a6b743 |
5 changed files with 491 additions and 8 deletions
35
docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md
Normal file
35
docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
# Unpaywall + pdftotext — Design
|
||||||
|
|
||||||
|
Two improvements to the paper CLI: try free open-access sources before piracy, and try simple PDF text extraction before heavy ML OCR.
|
||||||
|
|
||||||
|
## Download Pipeline
|
||||||
|
|
||||||
|
Priority chain for fetching the PDF:
|
||||||
|
|
||||||
|
1. **Unpaywall** (new) — hit `https://api.unpaywall.org/v2/{doi}?email={email}`, check `best_oa_location.url_for_pdf`, download if present
|
||||||
|
2. **LibGen** (existing)
|
||||||
|
3. **Anna's Archive** (existing)
|
||||||
|
|
||||||
|
Unpaywall requires an email address (no API key). Read from `UNPAYWALL_EMAIL` env var. If unset, skip silently — same pattern as `ANNAS_ARCHIVE_KEY`.
|
||||||
|
|
||||||
|
## Conversion Pipeline
|
||||||
|
|
||||||
|
Priority chain for PDF-to-markdown:
|
||||||
|
|
||||||
|
1. **pdftotext** (new) — shell out to `pdftotext -layout <pdf> -`, check output quality
|
||||||
|
2. **marker-pdf** (existing) — heavy ML OCR fallback
|
||||||
|
|
||||||
|
### Quality heuristic for pdftotext output
|
||||||
|
|
||||||
|
- Length > 500 characters
|
||||||
|
- >80% printable ASCII / common unicode (letters, digits, punctuation, whitespace)
|
||||||
|
|
||||||
|
If either check fails, or pdftotext isn't on PATH, fall back to marker.
|
||||||
|
|
||||||
|
## Nix changes
|
||||||
|
|
||||||
|
Add `poppler_utils` to the flake for `pdftotext`. Include in both the dev shell and the wrapped binary PATH.
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
No new Rust crate dependencies — `reqwest` and `serde_json` already handle the Unpaywall API call. `pdftotext` is an external binary like `marker_single`.
|
||||||
294
docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md
Normal file
294
docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md
Normal file
|
|
@ -0,0 +1,294 @@
|
||||||
|
# Unpaywall + pdftotext Implementation Plan
|
||||||
|
|
||||||
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||||
|
|
||||||
|
**Goal:** Try free open-access sources (Unpaywall) before piracy sources, and try simple PDF text extraction (pdftotext) before heavy ML OCR (marker-pdf).
|
||||||
|
|
||||||
|
**Architecture:** Two insertions into existing pipelines. `download_via_unpaywall` slots in as the first download source before LibGen. `try_pdftotext` slots in as the first conversion method before marker. Both follow existing patterns (shell out for pdftotext, HTTP+JSON for Unpaywall).
|
||||||
|
|
||||||
|
**Tech Stack:** Rust (existing), pdftotext from poppler-utils (new external dep), Unpaywall REST API (new)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 1: Add pdftotext quality heuristic with test
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/main.rs`
|
||||||
|
|
||||||
|
**Step 1: Add the quality check function and a test**
|
||||||
|
|
||||||
|
Add this above the `convert_to_markdown` function (around line 295):
|
||||||
|
|
||||||
|
```rust
|
||||||
|
/// Check if pdftotext output is usable: long enough and mostly readable text.
|
||||||
|
fn pdftotext_output_is_good(text: &str) -> bool {
|
||||||
|
if text.len() < 500 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
let printable = text.chars().filter(|c| {
|
||||||
|
c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation()
|
||||||
|
}).count();
|
||||||
|
let ratio = printable as f64 / text.chars().count() as f64;
|
||||||
|
ratio > 0.8
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Add a test module at the bottom of the file:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn good_text_passes_quality_check() {
|
||||||
|
let text = "This is a normal academic paper abstract. ".repeat(20);
|
||||||
|
assert!(pdftotext_output_is_good(&text));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn short_text_fails_quality_check() {
|
||||||
|
assert!(!pdftotext_output_is_good("too short"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn garbled_text_fails_quality_check() {
|
||||||
|
let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
|
||||||
|
assert!(!pdftotext_output_is_good(&garbled));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests**
|
||||||
|
|
||||||
|
Run: `cargo test`
|
||||||
|
Expected: all 3 tests pass.
|
||||||
|
|
||||||
|
**Step 3: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
feat: add pdftotext quality heuristic with tests
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 2: Add try_pdftotext conversion path
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/main.rs`
|
||||||
|
|
||||||
|
**Step 1: Add the try_pdftotext function**
|
||||||
|
|
||||||
|
Add this above `convert_to_markdown`:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
/// Try extracting text from a PDF using pdftotext.
|
||||||
|
/// Returns Some(text) if pdftotext succeeds and the output looks good,
|
||||||
|
/// None otherwise.
|
||||||
|
fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> {
|
||||||
|
let tmp_dir = tempfile::tempdir().ok()?;
|
||||||
|
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||||
|
std::fs::write(&pdf_path, pdf_bytes).ok()?;
|
||||||
|
|
||||||
|
eprintln!("Trying pdftotext…");
|
||||||
|
|
||||||
|
let output = Command::new("pdftotext")
|
||||||
|
.arg("-layout")
|
||||||
|
.arg(&pdf_path)
|
||||||
|
.arg("-")
|
||||||
|
.output();
|
||||||
|
|
||||||
|
match output {
|
||||||
|
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
||||||
|
eprintln!("pdftotext not found on PATH, skipping simple extraction");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("pdftotext failed: {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Ok(o) if !o.status.success() => {
|
||||||
|
eprintln!("pdftotext exited with {}", o.status);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Ok(o) => {
|
||||||
|
let text = String::from_utf8_lossy(&o.stdout).into_owned();
|
||||||
|
if pdftotext_output_is_good(&text) {
|
||||||
|
eprintln!("pdftotext output looks good, skipping marker");
|
||||||
|
Some(text)
|
||||||
|
} else {
|
||||||
|
eprintln!("pdftotext output is low quality, falling back to marker");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Wire it into convert_to_markdown**
|
||||||
|
|
||||||
|
Replace the beginning of `convert_to_markdown` (the function signature stays the same). The new body:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||||
|
// Try simple extraction first.
|
||||||
|
if let Some(text) = try_pdftotext(pdf_bytes) {
|
||||||
|
return Ok(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to marker-pdf.
|
||||||
|
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
||||||
|
// ... rest of existing marker logic unchanged ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Keep everything from `let pdf_path = tmp_dir.path().join("paper.pdf");` onward unchanged — just move the `tempfile::tempdir()` call below the pdftotext check so we don't create it unnecessarily.
|
||||||
|
|
||||||
|
**Step 3: Build and verify**
|
||||||
|
|
||||||
|
Run: `cargo build`
|
||||||
|
Expected: compiles cleanly.
|
||||||
|
|
||||||
|
**Step 4: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
feat: try pdftotext before marker-pdf for conversion
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 3: Add download_via_unpaywall
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `src/main.rs`
|
||||||
|
|
||||||
|
**Step 1: Add the Unpaywall download function**
|
||||||
|
|
||||||
|
Add a new section after the comment `// -- Helpers` block (before `validate_pdf`), or better, add a new section comment and function after the Anna's Archive section (after line 277):
|
||||||
|
|
||||||
|
```rust
|
||||||
|
// -- Unpaywall ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Try downloading an open-access PDF via the Unpaywall API.
|
||||||
|
fn download_via_unpaywall(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
doi: &str,
|
||||||
|
) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let email = std::env::var("UNPAYWALL_EMAIL")
|
||||||
|
.context("UNPAYWALL_EMAIL not set")?;
|
||||||
|
|
||||||
|
eprintln!("Checking Unpaywall for open-access PDF…");
|
||||||
|
|
||||||
|
let api_url = format!(
|
||||||
|
"https://api.unpaywall.org/v2/{doi}?email={email}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let resp: serde_json::Value = client
|
||||||
|
.get(&api_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to call Unpaywall API")?
|
||||||
|
.json()
|
||||||
|
.context("failed to parse Unpaywall API response")?;
|
||||||
|
|
||||||
|
let pdf_url = resp
|
||||||
|
.get("best_oa_location")
|
||||||
|
.and_then(|loc| loc.get("url_for_pdf"))
|
||||||
|
.and_then(|u| u.as_str())
|
||||||
|
.context("no open-access PDF available via Unpaywall")?;
|
||||||
|
|
||||||
|
eprintln!("Downloading open-access PDF from {pdf_url}");
|
||||||
|
let bytes = client
|
||||||
|
.get(pdf_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to download from Unpaywall PDF URL")?
|
||||||
|
.bytes()?;
|
||||||
|
|
||||||
|
validate_pdf(&bytes)?;
|
||||||
|
Ok(bytes.to_vec())
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Wire it into download_pdf as the first source**
|
||||||
|
|
||||||
|
In `download_pdf`, add the Unpaywall attempt before the LibGen attempt:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let client = http_client()?;
|
||||||
|
|
||||||
|
// Try Unpaywall first (free open-access).
|
||||||
|
match download_via_unpaywall(&client, doi) {
|
||||||
|
Ok(bytes) => return Ok(bytes),
|
||||||
|
Err(e) => eprintln!("Unpaywall: {e:#}"),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try LibGen.
|
||||||
|
match download_via_libgen(&client, doi) {
|
||||||
|
// ... rest unchanged
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Build and verify**
|
||||||
|
|
||||||
|
Run: `cargo build`
|
||||||
|
Expected: compiles cleanly.
|
||||||
|
|
||||||
|
**Step 4: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
feat: try Unpaywall for open-access PDFs before LibGen
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 4: Add poppler-utils to flake.nix
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `flake.nix`
|
||||||
|
|
||||||
|
**Step 1: Add poppler_utils to the wrapped binary PATH**
|
||||||
|
|
||||||
|
In the `paper-wrapped` definition, add `pkgs.poppler_utils` to the `makeBinPath` list:
|
||||||
|
|
||||||
|
```nix
|
||||||
|
--prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv pkgs.poppler_utils ]} \
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Add poppler_utils to the devShell**
|
||||||
|
|
||||||
|
```nix
|
||||||
|
devShells.default = pkgs.mkShell {
|
||||||
|
buildInputs = [
|
||||||
|
rust-nightly
|
||||||
|
marker.markerEnv
|
||||||
|
pkgs.poppler_utils
|
||||||
|
];
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Verify the flake evaluates**
|
||||||
|
|
||||||
|
Run: `nix flake check`
|
||||||
|
Expected: no errors.
|
||||||
|
|
||||||
|
**Step 4: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
feat: add poppler-utils (pdftotext) to nix flake
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 5: Update skill and design docs
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `skills/paper-reader/SKILL.md` — mention that simple PDFs are extracted directly, heavy OCR only used when needed
|
||||||
|
- Modify: `docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md` — mark as implemented
|
||||||
|
|
||||||
|
**Step 1: Update SKILL.md**
|
||||||
|
|
||||||
|
Add a note in the "How it works" section that the tool tries Unpaywall for open-access papers first, and uses simple text extraction before falling back to ML OCR. Mention `UNPAYWALL_EMAIL` env var.
|
||||||
|
|
||||||
|
**Step 2: Commit**
|
||||||
|
|
||||||
|
```
|
||||||
|
docs: update skill docs for unpaywall and pdftotext support
|
||||||
|
```
|
||||||
16
flake.nix
16
flake.nix
|
|
@ -21,6 +21,17 @@
|
||||||
|
|
||||||
marker = import ./nix/marker.nix { inherit pkgs; };
|
marker = import ./nix/marker.nix { inherit pkgs; };
|
||||||
|
|
||||||
|
# Pre-download the font marker needs so it doesn't try to write
|
||||||
|
# into the read-only nix store at runtime.
|
||||||
|
marker-font = pkgs.fetchurl {
|
||||||
|
url = "https://models.datalab.to/artifacts/GoNotoCurrent-Regular.ttf";
|
||||||
|
hash = "sha256-iCr7q5ZWCMLSvGJ/2AFrliqlpr4tNY+d4kp7WWfFYy4=";
|
||||||
|
};
|
||||||
|
marker-font-dir = pkgs.runCommand "marker-font-dir" {} ''
|
||||||
|
mkdir -p $out
|
||||||
|
ln -s ${marker-font} $out/GoNotoCurrent-Regular.ttf
|
||||||
|
'';
|
||||||
|
|
||||||
paper = pkgs.rustPlatform.buildRustPackage {
|
paper = pkgs.rustPlatform.buildRustPackage {
|
||||||
pname = "paper";
|
pname = "paper";
|
||||||
version = "0.1.0";
|
version = "0.1.0";
|
||||||
|
|
@ -35,7 +46,9 @@
|
||||||
nativeBuildInputs = [ pkgs.makeWrapper ];
|
nativeBuildInputs = [ pkgs.makeWrapper ];
|
||||||
postBuild = ''
|
postBuild = ''
|
||||||
wrapProgram $out/bin/paper \
|
wrapProgram $out/bin/paper \
|
||||||
--prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv ]}
|
--prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv pkgs.poppler_utils ]} \
|
||||||
|
--set FONT_DIR "${marker-font-dir}" \
|
||||||
|
--set FONT_PATH "${marker-font-dir}/GoNotoCurrent-Regular.ttf"
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
in
|
in
|
||||||
|
|
@ -49,6 +62,7 @@
|
||||||
buildInputs = [
|
buildInputs = [
|
||||||
rust-nightly
|
rust-nightly
|
||||||
marker.markerEnv
|
marker.markerEnv
|
||||||
|
pkgs.poppler_utils
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ description: Fetch and read academic papers by DOI. Use when (1) the user mentio
|
||||||
|
|
||||||
# Paper Reader
|
# Paper Reader
|
||||||
|
|
||||||
Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown via `marker_single`.
|
Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown. Uses simple text extraction (`pdftotext`) when possible, falling back to ML OCR (`marker_single`) for scanned or image-heavy papers.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|
@ -32,7 +32,15 @@ Results are cached at `~/.cache/paper/<DOI>.md`. Subsequent requests for the sam
|
||||||
|
|
||||||
## Download Sources
|
## Download Sources
|
||||||
|
|
||||||
The tool tries LibGen first (free, no authentication), then falls back to Anna's Archive fast download API if `ANNAS_ARCHIVE_KEY` is set.
|
The tool tries sources in this order:
|
||||||
|
|
||||||
|
1. **Unpaywall** — free open-access PDFs (requires `UNPAYWALL_EMAIL` env var)
|
||||||
|
2. **LibGen** — free, no authentication
|
||||||
|
3. **Anna's Archive** — fast download API (requires `ANNAS_ARCHIVE_KEY` env var)
|
||||||
|
|
||||||
|
## Conversion
|
||||||
|
|
||||||
|
PDF-to-markdown conversion tries simple text extraction first (`pdftotext`), which works well for most modern papers with proper text layers. If the output is low quality (garbled or too short), it falls back to ML OCR via `marker_single`.
|
||||||
|
|
||||||
## Errors
|
## Errors
|
||||||
|
|
||||||
|
|
|
||||||
142
src/main.rs
142
src/main.rs
|
|
@ -89,13 +89,19 @@ fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
|
||||||
|
|
||||||
/// Download a paper PDF by DOI.
|
/// Download a paper PDF by DOI.
|
||||||
///
|
///
|
||||||
/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's
|
/// Tries Unpaywall first (free open-access, requires `UNPAYWALL_EMAIL`).
|
||||||
/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast
|
/// Falls back to LibGen (free, no JS challenge), then Anna's Archive if an
|
||||||
/// download API as a fallback.
|
/// API key is configured via `ANNAS_ARCHIVE_KEY`.
|
||||||
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
||||||
let client = http_client()?;
|
let client = http_client()?;
|
||||||
|
|
||||||
// Try LibGen first.
|
// Try Unpaywall first (free open-access).
|
||||||
|
match download_via_unpaywall(&client, doi) {
|
||||||
|
Ok(bytes) => return Ok(bytes),
|
||||||
|
Err(e) => eprintln!("Unpaywall: {e:#}"),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try LibGen.
|
||||||
match download_via_libgen(&client, doi) {
|
match download_via_libgen(&client, doi) {
|
||||||
Ok(bytes) => return Ok(bytes),
|
Ok(bytes) => return Ok(bytes),
|
||||||
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
||||||
|
|
@ -276,6 +282,50 @@ fn download_via_annas_archive(
|
||||||
Ok(bytes.to_vec())
|
Ok(bytes.to_vec())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- Unpaywall ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Try downloading an open-access PDF via the Unpaywall API.
|
||||||
|
fn download_via_unpaywall(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
doi: &str,
|
||||||
|
) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let email = std::env::var("UNPAYWALL_EMAIL")
|
||||||
|
.context("UNPAYWALL_EMAIL not set")?;
|
||||||
|
|
||||||
|
eprintln!("Checking Unpaywall for open-access PDF…");
|
||||||
|
|
||||||
|
let api_url = format!(
|
||||||
|
"https://api.unpaywall.org/v2/{doi}?email={email}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let resp: serde_json::Value = client
|
||||||
|
.get(&api_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to call Unpaywall API")?
|
||||||
|
.error_for_status()
|
||||||
|
.context("Unpaywall API returned an error status")?
|
||||||
|
.json()
|
||||||
|
.context("failed to parse Unpaywall API response")?;
|
||||||
|
|
||||||
|
let pdf_url = resp
|
||||||
|
.get("best_oa_location")
|
||||||
|
.and_then(|loc| loc.get("url_for_pdf"))
|
||||||
|
.and_then(|u| u.as_str())
|
||||||
|
.context("no open-access PDF available via Unpaywall")?;
|
||||||
|
|
||||||
|
eprintln!("Downloading open-access PDF from {pdf_url}");
|
||||||
|
let bytes = client
|
||||||
|
.get(pdf_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to download from Unpaywall PDF URL")?
|
||||||
|
.error_for_status()
|
||||||
|
.context("Unpaywall PDF URL returned an error status")?
|
||||||
|
.bytes()?;
|
||||||
|
|
||||||
|
validate_pdf(&bytes)?;
|
||||||
|
Ok(bytes.to_vec())
|
||||||
|
}
|
||||||
|
|
||||||
// -- Helpers ----------------------------------------------------------------
|
// -- Helpers ----------------------------------------------------------------
|
||||||
|
|
||||||
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
||||||
|
|
@ -292,16 +342,76 @@ fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
||||||
// Conversion
|
// Conversion
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Check if pdftotext output is usable: long enough and mostly readable text.
///
/// Two checks, mirroring the design doc:
/// 1. At least 500 bytes of output — very short output usually means the
///    PDF has no real text layer.
/// 2. More than 80% of the characters are "printable": alphanumeric,
///    whitespace, or ASCII punctuation. Garbled extractions (replacement
///    characters, box-drawing glyphs) fail this ratio.
fn pdftotext_output_is_good(text: &str) -> bool {
    // Byte length is a cheap lower bound; it also guarantees the character
    // count below is non-zero, so the division is safe.
    if text.len() < 500 {
        return false;
    }
    // Count total and printable characters in a single pass instead of
    // walking the string twice.
    let (total, printable) = text.chars().fold((0u64, 0u64), |(t, p), c| {
        let ok = c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation();
        (t + 1, p + u64::from(ok))
    });
    printable as f64 / total as f64 > 0.8
}
|
||||||
|
|
||||||
|
/// Try extracting text from a PDF using pdftotext.
|
||||||
|
/// Returns Some(text) if pdftotext succeeds and the output looks good,
|
||||||
|
/// None otherwise.
|
||||||
|
fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> {
|
||||||
|
let tmp_dir = tempfile::tempdir().ok()?;
|
||||||
|
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||||
|
std::fs::write(&pdf_path, pdf_bytes).ok()?;
|
||||||
|
|
||||||
|
eprintln!("Trying pdftotext…");
|
||||||
|
|
||||||
|
let output = Command::new("pdftotext")
|
||||||
|
.arg("-layout")
|
||||||
|
.arg(&pdf_path)
|
||||||
|
.arg("-")
|
||||||
|
.output();
|
||||||
|
|
||||||
|
match output {
|
||||||
|
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
||||||
|
eprintln!("pdftotext not found on PATH, skipping simple extraction");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("pdftotext failed: {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Ok(o) if !o.status.success() => {
|
||||||
|
eprintln!("pdftotext exited with {}", o.status);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
Ok(o) => {
|
||||||
|
let text = String::from_utf8_lossy(&o.stdout).into_owned();
|
||||||
|
if pdftotext_output_is_good(&text) {
|
||||||
|
eprintln!("pdftotext output looks good, skipping marker");
|
||||||
|
Some(text)
|
||||||
|
} else {
|
||||||
|
eprintln!("pdftotext output is low quality, falling back to marker");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Write PDF bytes to a temp file, run marker_single, and return the
|
/// Write PDF bytes to a temp file, run marker_single, and return the
|
||||||
/// resulting markdown.
|
/// resulting markdown.
|
||||||
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||||
|
// Try simple extraction first.
|
||||||
|
if let Some(text) = try_pdftotext(pdf_bytes) {
|
||||||
|
return Ok(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to marker-pdf.
|
||||||
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
||||||
let pdf_path = tmp_dir.path().join("paper.pdf");
|
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||||
let out_dir = tmp_dir.path().join("output");
|
let out_dir = tmp_dir.path().join("output");
|
||||||
|
|
||||||
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
||||||
|
|
||||||
eprintln!("Converting PDF to markdown…");
|
eprintln!("Converting PDF to markdown with marker…");
|
||||||
|
|
||||||
let status = Command::new("marker_single")
|
let status = Command::new("marker_single")
|
||||||
.arg(&pdf_path)
|
.arg(&pdf_path)
|
||||||
|
|
@ -342,3 +452,25 @@ fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
|
||||||
}
|
}
|
||||||
bail!("no .md file found in marker output")
|
bail!("no .md file found in marker output")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn good_text_passes_quality_check() {
|
||||||
|
let text = "This is a normal academic paper abstract. ".repeat(20);
|
||||||
|
assert!(pdftotext_output_is_good(&text));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn short_text_fails_quality_check() {
|
||||||
|
assert!(!pdftotext_output_is_good("too short"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn garbled_text_fails_quality_check() {
|
||||||
|
let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
|
||||||
|
assert!(!pdftotext_output_is_good(&garbled));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue