diff --git a/docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md b/docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md deleted file mode 100644 index 6178957..0000000 --- a/docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md +++ /dev/null @@ -1,35 +0,0 @@ -# Unpaywall + pdftotext — Design - -Two improvements to the paper CLI: try free open-access sources before piracy, and try simple PDF text extraction before heavy ML OCR. - -## Download Pipeline - -Priority chain for fetching the PDF: - -1. **Unpaywall** (new) — hit `https://api.unpaywall.org/v2/{doi}?email={email}`, check `best_oa_location.url_for_pdf`, download if present -2. **LibGen** (existing) -3. **Anna's Archive** (existing) - -Unpaywall requires an email address (no API key). Read from `UNPAYWALL_EMAIL` env var. If unset, skip silently — same pattern as `ANNAS_ARCHIVE_KEY`. - -## Conversion Pipeline - -Priority chain for PDF-to-markdown: - -1. **pdftotext** (new) — shell out to `pdftotext -layout -`, check output quality -2. **marker-pdf** (existing) — heavy ML OCR fallback - -### Quality heuristic for pdftotext output - -- Length > 500 characters -- >80% printable ASCII / common unicode (letters, digits, punctuation, whitespace) - -If either check fails, or pdftotext isn't on PATH, fall back to marker. - -## Nix changes - -Add `poppler_utils` to the flake for `pdftotext`. Include in both the dev shell and the wrapped binary PATH. - -## Dependencies - -No new Rust crate dependencies — `reqwest` and `serde_json` already handle the Unpaywall API call. `pdftotext` is an external binary like `marker_single`. diff --git a/docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md b/docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md deleted file mode 100644 index 1a0a8e4..0000000 --- a/docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md +++ /dev/null @@ -1,294 +0,0 @@ -# Unpaywall + pdftotext Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Try free open-access sources (Unpaywall) before piracy sources, and try simple PDF text extraction (pdftotext) before heavy ML OCR (marker-pdf). - -**Architecture:** Two insertions into existing pipelines. `download_via_unpaywall` slots in as the first download source before LibGen. `try_pdftotext` slots in as the first conversion method before marker. Both follow existing patterns (shell out for pdftotext, HTTP+JSON for Unpaywall). - -**Tech Stack:** Rust (existing), pdftotext from poppler-utils (new external dep), Unpaywall REST API (new) - ---- - -### Task 1: Add pdftotext quality heuristic with test - -**Files:** -- Modify: `src/main.rs` - -**Step 1: Add the quality check function and a test** - -Add this above the `convert_to_markdown` function (around line 295): - -```rust -/// Check if pdftotext output is usable: long enough and mostly readable text. -fn pdftotext_output_is_good(text: &str) -> bool { - if text.len() < 500 { - return false; - } - let printable = text.chars().filter(|c| { - c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation() - }).count(); - let ratio = printable as f64 / text.chars().count() as f64; - ratio > 0.8 -} -``` - -Add a test module at the bottom of the file: - -```rust -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn good_text_passes_quality_check() { - let text = "This is a normal academic paper abstract. ".repeat(20); - assert!(pdftotext_output_is_good(&text)); - } - - #[test] - fn short_text_fails_quality_check() { - assert!(!pdftotext_output_is_good("too short")); - } - - #[test] - fn garbled_text_fails_quality_check() { - let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200); - assert!(!pdftotext_output_is_good(&garbled)); - } -} -``` - -**Step 2: Run tests** - -Run: `cargo test` -Expected: all 3 tests pass. - -**Step 3: Commit** - -``` -feat: add pdftotext quality heuristic with tests -``` - ---- - -### Task 2: Add try_pdftotext conversion path - -**Files:** -- Modify: `src/main.rs` - -**Step 1: Add the try_pdftotext function** - -Add this above `convert_to_markdown`: - -```rust -/// Try extracting text from a PDF using pdftotext. -/// Returns Some(text) if pdftotext succeeds and the output looks good, -/// None otherwise. -fn try_pdftotext(pdf_bytes: &[u8]) -> Option { - let tmp_dir = tempfile::tempdir().ok()?; - let pdf_path = tmp_dir.path().join("paper.pdf"); - std::fs::write(&pdf_path, pdf_bytes).ok()?; - - eprintln!("Trying pdftotext…"); - - let output = Command::new("pdftotext") - .arg("-layout") - .arg(&pdf_path) - .arg("-") - .output(); - - match output { - Err(e) if e.kind() == io::ErrorKind::NotFound => { - eprintln!("pdftotext not found on PATH, skipping simple extraction"); - None - } - Err(e) => { - eprintln!("pdftotext failed: {e}"); - None - } - Ok(o) if !o.status.success() => { - eprintln!("pdftotext exited with {}", o.status); - None - } - Ok(o) => { - let text = String::from_utf8_lossy(&o.stdout).into_owned(); - if pdftotext_output_is_good(&text) { - eprintln!("pdftotext output looks good, skipping marker"); - Some(text) - } else { - eprintln!("pdftotext output is low quality, falling back to marker"); - None - } - } - } -} -``` - -**Step 2: Wire it into convert_to_markdown** - -Replace the beginning of `convert_to_markdown` (the function signature stays the same). The new body: - -```rust -fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result { - // Try simple extraction first. - if let Some(text) = try_pdftotext(pdf_bytes) { - return Ok(text); - } - - // Fall back to marker-pdf. - let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?; - // ... rest of existing marker logic unchanged ... -``` - -Keep everything from `let pdf_path = tmp_dir.path().join("paper.pdf");` onward unchanged — just move the `tempfile::tempdir()` call below the pdftotext check so we don't create it unnecessarily. - -**Step 3: Build and verify** - -Run: `cargo build` -Expected: compiles cleanly. - -**Step 4: Commit** - -``` -feat: try pdftotext before marker-pdf for conversion -``` - ---- - -### Task 3: Add download_via_unpaywall - -**Files:** -- Modify: `src/main.rs` - -**Step 1: Add the Unpaywall download function** - -Add a new section after the comment `// -- Helpers` block (before `validate_pdf`), or better, add a new section comment and function after the Anna's Archive section (after line 277): - -```rust -// -- Unpaywall --------------------------------------------------------------- - -/// Try downloading an open-access PDF via the Unpaywall API. -fn download_via_unpaywall( - client: &reqwest::blocking::Client, - doi: &str, -) -> anyhow::Result> { - let email = std::env::var("UNPAYWALL_EMAIL") - .context("UNPAYWALL_EMAIL not set")?; - - eprintln!("Checking Unpaywall for open-access PDF…"); - - let api_url = format!( - "https://api.unpaywall.org/v2/{doi}?email={email}" - ); - - let resp: serde_json::Value = client - .get(&api_url) - .send() - .context("failed to call Unpaywall API")? - .json() - .context("failed to parse Unpaywall API response")?; - - let pdf_url = resp - .get("best_oa_location") - .and_then(|loc| loc.get("url_for_pdf")) - .and_then(|u| u.as_str()) - .context("no open-access PDF available via Unpaywall")?; - - eprintln!("Downloading open-access PDF from {pdf_url}"); - let bytes = client - .get(pdf_url) - .send() - .context("failed to download from Unpaywall PDF URL")? - .bytes()?; - - validate_pdf(&bytes)?; - Ok(bytes.to_vec()) -} -``` - -**Step 2: Wire it into download_pdf as the first source** - -In `download_pdf`, add the Unpaywall attempt before the LibGen attempt: - -```rust -fn download_pdf(doi: &str) -> anyhow::Result> { - let client = http_client()?; - - // Try Unpaywall first (free open-access). - match download_via_unpaywall(&client, doi) { - Ok(bytes) => return Ok(bytes), - Err(e) => eprintln!("Unpaywall: {e:#}"), - } - - // Try LibGen. - match download_via_libgen(&client, doi) { - // ... rest unchanged -``` - -**Step 3: Build and verify** - -Run: `cargo build` -Expected: compiles cleanly. - -**Step 4: Commit** - -``` -feat: try Unpaywall for open-access PDFs before LibGen -``` - ---- - -### Task 4: Add poppler-utils to flake.nix - -**Files:** -- Modify: `flake.nix` - -**Step 1: Add poppler_utils to the wrapped binary PATH** - -In the `paper-wrapped` definition, add `pkgs.poppler_utils` to the `makeBinPath` list: - -```nix ---prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv pkgs.poppler_utils ]} \ -``` - -**Step 2: Add poppler_utils to the devShell** - -```nix -devShells.default = pkgs.mkShell { - buildInputs = [ - rust-nightly - marker.markerEnv - pkgs.poppler_utils - ]; -}; -``` - -**Step 3: Verify the flake evaluates** - -Run: `nix flake check` -Expected: no errors. - -**Step 4: Commit** - -``` -feat: add poppler-utils (pdftotext) to nix flake -``` - ---- - -### Task 5: Update skill and design docs - -**Files:** -- Modify: `skills/paper-reader/SKILL.md` — mention that simple PDFs are extracted directly, heavy OCR only used when needed -- Modify: `docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md` — mark as implemented - -**Step 1: Update SKILL.md** - -Add a note in the "How it works" section that the tool tries Unpaywall for open-access papers first, and uses simple text extraction before falling back to ML OCR. Mention `UNPAYWALL_EMAIL` env var. - -**Step 2: Commit** - -``` -docs: update skill docs for unpaywall and pdftotext support -``` diff --git a/flake.nix b/flake.nix index 3d2fc2c..7045f40 100644 --- a/flake.nix +++ b/flake.nix @@ -21,17 +21,6 @@ marker = import ./nix/marker.nix { inherit pkgs; }; - # Pre-download the font marker needs so it doesn't try to write - # into the read-only nix store at runtime. - marker-font = pkgs.fetchurl { - url = "https://models.datalab.to/artifacts/GoNotoCurrent-Regular.ttf"; - hash = "sha256-iCr7q5ZWCMLSvGJ/2AFrliqlpr4tNY+d4kp7WWfFYy4="; - }; - marker-font-dir = pkgs.runCommand "marker-font-dir" {} '' - mkdir -p $out - ln -s ${marker-font} $out/GoNotoCurrent-Regular.ttf - ''; - paper = pkgs.rustPlatform.buildRustPackage { pname = "paper"; version = "0.1.0"; @@ -46,9 +35,7 @@ nativeBuildInputs = [ pkgs.makeWrapper ]; postBuild = '' wrapProgram $out/bin/paper \ - --prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv pkgs.poppler-utils ]} \ - --set FONT_DIR "${marker-font-dir}" \ - --set FONT_PATH "${marker-font-dir}/GoNotoCurrent-Regular.ttf" + --prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv ]} ''; }; in @@ -62,7 +49,6 @@ buildInputs = [ rust-nightly marker.markerEnv - pkgs.poppler-utils ]; }; }); diff --git a/skills/paper-reader/SKILL.md b/skills/paper-reader/SKILL.md index d20c63b..126deec 100644 --- a/skills/paper-reader/SKILL.md +++ b/skills/paper-reader/SKILL.md @@ -5,7 +5,7 @@ description: Fetch and read academic papers by DOI. Use when (1) the user mentio # Paper Reader -Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown. Uses simple text extraction (`pdftotext`) when possible, falling back to ML OCR (`marker_single`) for scanned or image-heavy papers. +Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown via `marker_single`. ## Usage @@ -32,15 +32,7 @@ Results are cached at `~/.cache/paper/.md`. Subsequent requests for the sam ## Download Sources -The tool tries sources in this order: - -1. **Unpaywall** — free open-access PDFs (requires `UNPAYWALL_EMAIL` env var) -2. **LibGen** — free, no authentication -3. **Anna's Archive** — fast download API (requires `ANNAS_ARCHIVE_KEY` env var) - -## Conversion - -PDF-to-markdown conversion tries simple text extraction first (`pdftotext`), which works well for most modern papers with proper text layers. If the output is low quality (garbled or too short), it falls back to ML OCR via `marker_single`. +The tool tries LibGen first (free, no authentication), then falls back to Anna's Archive fast download API if `ANNAS_ARCHIVE_KEY` is set. ## Errors diff --git a/src/main.rs b/src/main.rs index 59a6759..7c760ef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -89,19 +89,13 @@ fn http_client() -> anyhow::Result { /// Download a paper PDF by DOI. /// -/// Tries Unpaywall first (free open-access, requires `UNPAYWALL_EMAIL`). -/// Falls back to LibGen (free, no JS challenge), then Anna's Archive if an -/// API key is configured via `ANNAS_ARCHIVE_KEY`. +/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's +/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast +/// download API as a fallback. fn download_pdf(doi: &str) -> anyhow::Result> { let client = http_client()?; - // Try Unpaywall first (free open-access). - match download_via_unpaywall(&client, doi) { - Ok(bytes) => return Ok(bytes), - Err(e) => eprintln!("Unpaywall: {e:#}"), - } - - // Try LibGen. + // Try LibGen first. match download_via_libgen(&client, doi) { Ok(bytes) => return Ok(bytes), Err(e) => eprintln!("LibGen failed: {e:#}"), @@ -282,50 +276,6 @@ fn download_via_annas_archive( Ok(bytes.to_vec()) } -// -- Unpaywall --------------------------------------------------------------- - -/// Try downloading an open-access PDF via the Unpaywall API. -fn download_via_unpaywall( - client: &reqwest::blocking::Client, - doi: &str, -) -> anyhow::Result> { - let email = std::env::var("UNPAYWALL_EMAIL") - .context("UNPAYWALL_EMAIL not set")?; - - eprintln!("Checking Unpaywall for open-access PDF…"); - - let api_url = format!( - "https://api.unpaywall.org/v2/{doi}?email={email}" - ); - - let resp: serde_json::Value = client - .get(&api_url) - .send() - .context("failed to call Unpaywall API")? - .error_for_status() - .context("Unpaywall API returned an error status")? - .json() - .context("failed to parse Unpaywall API response")?; - - let pdf_url = resp - .get("best_oa_location") - .and_then(|loc| loc.get("url_for_pdf")) - .and_then(|u| u.as_str()) - .context("no open-access PDF available via Unpaywall")?; - - eprintln!("Downloading open-access PDF from {pdf_url}"); - let bytes = client - .get(pdf_url) - .send() - .context("failed to download from Unpaywall PDF URL")? - .error_for_status() - .context("Unpaywall PDF URL returned an error status")? - .bytes()?; - - validate_pdf(&bytes)?; - Ok(bytes.to_vec()) -} - // -- Helpers ---------------------------------------------------------------- fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> { @@ -342,76 +292,16 @@ fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> { // Conversion // --------------------------------------------------------------------------- -/// Check if pdftotext output is usable: long enough and mostly readable text. -fn pdftotext_output_is_good(text: &str) -> bool { - if text.len() < 500 { - return false; - } - let printable = text.chars().filter(|c| { - c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation() - }).count(); - let ratio = printable as f64 / text.chars().count() as f64; - ratio > 0.8 -} - -/// Try extracting text from a PDF using pdftotext. -/// Returns Some(text) if pdftotext succeeds and the output looks good, -/// None otherwise. -fn try_pdftotext(pdf_bytes: &[u8]) -> Option { - let tmp_dir = tempfile::tempdir().ok()?; - let pdf_path = tmp_dir.path().join("paper.pdf"); - std::fs::write(&pdf_path, pdf_bytes).ok()?; - - eprintln!("Trying pdftotext…"); - - let output = Command::new("pdftotext") - .arg("-layout") - .arg(&pdf_path) - .arg("-") - .output(); - - match output { - Err(e) if e.kind() == io::ErrorKind::NotFound => { - eprintln!("pdftotext not found on PATH, skipping simple extraction"); - None - } - Err(e) => { - eprintln!("pdftotext failed: {e}"); - None - } - Ok(o) if !o.status.success() => { - eprintln!("pdftotext exited with {}", o.status); - None - } - Ok(o) => { - let text = String::from_utf8_lossy(&o.stdout).into_owned(); - if pdftotext_output_is_good(&text) { - eprintln!("pdftotext output looks good, skipping marker"); - Some(text) - } else { - eprintln!("pdftotext output is low quality, falling back to marker"); - None - } - } - } -} - /// Write PDF bytes to a temp file, run marker_single, and return the /// resulting markdown. fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result { - // Try simple extraction first. - if let Some(text) = try_pdftotext(pdf_bytes) { - return Ok(text); - } - - // Fall back to marker-pdf. let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?; let pdf_path = tmp_dir.path().join("paper.pdf"); let out_dir = tmp_dir.path().join("output"); std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?; - eprintln!("Converting PDF to markdown with marker…"); + eprintln!("Converting PDF to markdown…"); let status = Command::new("marker_single") .arg(&pdf_path) @@ -452,25 +342,3 @@ fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result { } bail!("no .md file found in marker output") } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn good_text_passes_quality_check() { - let text = "This is a normal academic paper abstract. ".repeat(20); - assert!(pdftotext_output_is_good(&text)); - } - - #[test] - fn short_text_fails_quality_check() { - assert!(!pdftotext_output_is_good("too short")); - } - - #[test] - fn garbled_text_fails_quality_check() { - let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200); - assert!(!pdftotext_output_is_good(&garbled)); - } -}