From 91920d410394560b79f3770d3c9f84dfe5661f3c Mon Sep 17 00:00:00 2001 From: Ellie <6687206+wizzeh@users.noreply.github.com> Date: Wed, 25 Feb 2026 13:35:18 -0800 Subject: [PATCH] docs: update skill docs for unpaywall and pdftotext support Co-Authored-By: Claude Opus 4.6 --- ...26-02-25-unpaywall-and-pdftotext-design.md | 35 +++ ...2026-02-25-unpaywall-and-pdftotext-plan.md | 294 ++++++++++++++++++ skills/paper-reader/SKILL.md | 12 +- 3 files changed, 339 insertions(+), 2 deletions(-) create mode 100644 docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md create mode 100644 docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md diff --git a/docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md b/docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md new file mode 100644 index 0000000..6178957 --- /dev/null +++ b/docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md @@ -0,0 +1,35 @@ +# Unpaywall + pdftotext — Design + +Two improvements to the paper CLI: try free open-access sources before piracy, and try simple PDF text extraction before heavy ML OCR. + +## Download Pipeline + +Priority chain for fetching the PDF: + +1. **Unpaywall** (new) — hit `https://api.unpaywall.org/v2/{doi}?email={email}`, check `best_oa_location.url_for_pdf`, download if present +2. **LibGen** (existing) +3. **Anna's Archive** (existing) + +Unpaywall requires an email address (no API key). Read from `UNPAYWALL_EMAIL` env var. If unset, skip silently — same pattern as `ANNAS_ARCHIVE_KEY`. + +## Conversion Pipeline + +Priority chain for PDF-to-markdown: + +1. **pdftotext** (new) — shell out to `pdftotext -layout <pdf> -`, check output quality +2. **marker-pdf** (existing) — heavy ML OCR fallback + +### Quality heuristic for pdftotext output + +- Length > 500 characters +- >80% printable ASCII / common unicode (letters, digits, punctuation, whitespace) + +If either check fails, or pdftotext isn't on PATH, fall back to marker. 
+ +## Nix changes + +Add `poppler_utils` to the flake for `pdftotext`. Include in both the dev shell and the wrapped binary PATH. + +## Dependencies + +No new Rust crate dependencies — `reqwest` and `serde_json` already handle the Unpaywall API call. `pdftotext` is an external binary like `marker_single`. diff --git a/docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md b/docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md new file mode 100644 index 0000000..1a0a8e4 --- /dev/null +++ b/docs/plans/2026-02-25-unpaywall-and-pdftotext-plan.md @@ -0,0 +1,294 @@ +# Unpaywall + pdftotext Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Try free open-access sources (Unpaywall) before piracy sources, and try simple PDF text extraction (pdftotext) before heavy ML OCR (marker-pdf). + +**Architecture:** Two insertions into existing pipelines. `download_via_unpaywall` slots in as the first download source before LibGen. `try_pdftotext` slots in as the first conversion method before marker. Both follow existing patterns (shell out for pdftotext, HTTP+JSON for Unpaywall). + +**Tech Stack:** Rust (existing), pdftotext from poppler-utils (new external dep), Unpaywall REST API (new) + +--- + +### Task 1: Add pdftotext quality heuristic with test + +**Files:** +- Modify: `src/main.rs` + +**Step 1: Add the quality check function and a test** + +Add this above the `convert_to_markdown` function (around line 295): + +```rust +/// Check if pdftotext output is usable: long enough and mostly readable text. 
+fn pdftotext_output_is_good(text: &str) -> bool { + if text.len() < 500 { + return false; + } + let printable = text.chars().filter(|c| { + c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation() + }).count(); + let ratio = printable as f64 / text.chars().count() as f64; + ratio > 0.8 +} +``` + +Add a test module at the bottom of the file: + +```rust +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn good_text_passes_quality_check() { + let text = "This is a normal academic paper abstract. ".repeat(20); + assert!(pdftotext_output_is_good(&text)); + } + + #[test] + fn short_text_fails_quality_check() { + assert!(!pdftotext_output_is_good("too short")); + } + + #[test] + fn garbled_text_fails_quality_check() { + let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200); + assert!(!pdftotext_output_is_good(&garbled)); + } +} +``` + +**Step 2: Run tests** + +Run: `cargo test` +Expected: all 3 tests pass. + +**Step 3: Commit** + +``` +feat: add pdftotext quality heuristic with tests +``` + +--- + +### Task 2: Add try_pdftotext conversion path + +**Files:** +- Modify: `src/main.rs` + +**Step 1: Add the try_pdftotext function** + +Add this above `convert_to_markdown`: + +```rust +/// Try extracting text from a PDF using pdftotext. +/// Returns Some(text) if pdftotext succeeds and the output looks good, +/// None otherwise. 
+fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> { + let tmp_dir = tempfile::tempdir().ok()?; + let pdf_path = tmp_dir.path().join("paper.pdf"); + std::fs::write(&pdf_path, pdf_bytes).ok()?; + + eprintln!("Trying pdftotext…"); + + let output = Command::new("pdftotext") + .arg("-layout") + .arg(&pdf_path) + .arg("-") + .output(); + + match output { + Err(e) if e.kind() == io::ErrorKind::NotFound => { + eprintln!("pdftotext not found on PATH, skipping simple extraction"); + None + } + Err(e) => { + eprintln!("pdftotext failed: {e}"); + None + } + Ok(o) if !o.status.success() => { + eprintln!("pdftotext exited with {}", o.status); + None + } + Ok(o) => { + let text = String::from_utf8_lossy(&o.stdout).into_owned(); + if pdftotext_output_is_good(&text) { + eprintln!("pdftotext output looks good, skipping marker"); + Some(text) + } else { + eprintln!("pdftotext output is low quality, falling back to marker"); + None + } + } + } +} +``` + +**Step 2: Wire it into convert_to_markdown** + +Replace the beginning of `convert_to_markdown` (the function signature stays the same). The new body: + +```rust +fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> { + // Try simple extraction first. + if let Some(text) = try_pdftotext(pdf_bytes) { + return Ok(text); + } + + // Fall back to marker-pdf. + let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?; + // ... rest of existing marker logic unchanged ... +``` + +Keep everything from `let pdf_path = tmp_dir.path().join("paper.pdf");` onward unchanged — just move the `tempfile::tempdir()` call below the pdftotext check so we don't create it unnecessarily. + +**Step 3: Build and verify** + +Run: `cargo build` +Expected: compiles cleanly. 
+ +**Step 4: Commit** + +``` +feat: try pdftotext before marker-pdf for conversion +``` + +--- + +### Task 3: Add download_via_unpaywall + +**Files:** +- Modify: `src/main.rs` + +**Step 1: Add the Unpaywall download function** + +Add a new section after the comment `// -- Helpers` block (before `validate_pdf`), or better, add a new section comment and function after the Anna's Archive section (after line 277): + +```rust +// -- Unpaywall --------------------------------------------------------------- + +/// Try downloading an open-access PDF via the Unpaywall API. +fn download_via_unpaywall( + client: &reqwest::blocking::Client, + doi: &str, +) -> anyhow::Result<Vec<u8>> { + let email = std::env::var("UNPAYWALL_EMAIL") + .context("UNPAYWALL_EMAIL not set")?; + + eprintln!("Checking Unpaywall for open-access PDF…"); + + let api_url = format!( + "https://api.unpaywall.org/v2/{doi}?email={email}" + ); + + let resp: serde_json::Value = client + .get(&api_url) + .send() + .context("failed to call Unpaywall API")? + .json() + .context("failed to parse Unpaywall API response")?; + + let pdf_url = resp + .get("best_oa_location") + .and_then(|loc| loc.get("url_for_pdf")) + .and_then(|u| u.as_str()) + .context("no open-access PDF available via Unpaywall")?; + + eprintln!("Downloading open-access PDF from {pdf_url}"); + let bytes = client + .get(pdf_url) + .send() + .context("failed to download from Unpaywall PDF URL")? + .bytes()?; + + validate_pdf(&bytes)?; + Ok(bytes.to_vec()) +} +``` + +**Step 2: Wire it into download_pdf as the first source** + +In `download_pdf`, add the Unpaywall attempt before the LibGen attempt: + +```rust +fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> { + let client = http_client()?; + + // Try Unpaywall first (free open-access). + match download_via_unpaywall(&client, doi) { + Ok(bytes) => return Ok(bytes), + Err(e) => eprintln!("Unpaywall: {e:#}"), + } + + // Try LibGen. + match download_via_libgen(&client, doi) { + // ... 
rest unchanged +``` + +**Step 3: Build and verify** + +Run: `cargo build` +Expected: compiles cleanly. + +**Step 4: Commit** + +``` +feat: try Unpaywall for open-access PDFs before LibGen +``` + +--- + +### Task 4: Add poppler-utils to flake.nix + +**Files:** +- Modify: `flake.nix` + +**Step 1: Add poppler_utils to the wrapped binary PATH** + +In the `paper-wrapped` definition, add `pkgs.poppler_utils` to the `makeBinPath` list: + +```nix +--prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv pkgs.poppler_utils ]} \ +``` + +**Step 2: Add poppler_utils to the devShell** + +```nix +devShells.default = pkgs.mkShell { + buildInputs = [ + rust-nightly + marker.markerEnv + pkgs.poppler_utils + ]; +}; +``` + +**Step 3: Verify the flake evaluates** + +Run: `nix flake check` +Expected: no errors. + +**Step 4: Commit** + +``` +feat: add poppler-utils (pdftotext) to nix flake +``` + +--- + +### Task 5: Update skill and design docs + +**Files:** +- Modify: `skills/paper-reader/SKILL.md` — mention that simple PDFs are extracted directly, heavy OCR only used when needed +- Modify: `docs/plans/2026-02-25-unpaywall-and-pdftotext-design.md` — mark as implemented + +**Step 1: Update SKILL.md** + +Add a note in the "How it works" section that the tool tries Unpaywall for open-access papers first, and uses simple text extraction before falling back to ML OCR. Mention `UNPAYWALL_EMAIL` env var. + +**Step 2: Commit** + +``` +docs: update skill docs for unpaywall and pdftotext support +``` diff --git a/skills/paper-reader/SKILL.md b/skills/paper-reader/SKILL.md index 126deec..d20c63b 100644 --- a/skills/paper-reader/SKILL.md +++ b/skills/paper-reader/SKILL.md @@ -5,7 +5,7 @@ description: Fetch and read academic papers by DOI. Use when (1) the user mentio # Paper Reader -Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown via `marker_single`. 
+Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown. Uses simple text extraction (`pdftotext`) when possible, falling back to ML OCR (`marker_single`) for scanned or image-heavy papers. ## Usage @@ -32,7 +32,15 @@ Results are cached at `~/.cache/paper/.md`. Subsequent requests for the sam ## Download Sources -The tool tries LibGen first (free, no authentication), then falls back to Anna's Archive fast download API if `ANNAS_ARCHIVE_KEY` is set. +The tool tries sources in this order: + +1. **Unpaywall** — free open-access PDFs (requires `UNPAYWALL_EMAIL` env var) +2. **LibGen** — free, no authentication +3. **Anna's Archive** — fast download API (requires `ANNAS_ARCHIVE_KEY` env var) + +## Conversion + +PDF-to-markdown conversion tries simple text extraction first (`pdftotext`), which works well for most modern papers with proper text layers. If the output is low quality (garbled or too short), it falls back to ML OCR via `marker_single`. ## Errors