Add paper CLI: download academic papers by DOI and convert to markdown

Downloads PDFs from LibGen (primary) or Anna's Archive API (fallback),
converts to markdown via marker_single, and prints to stdout. Includes
XDG-compliant caching, nix flake with marker-pdf packaging, and a
Claude Code skill for paper-reader integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Ellie 2026-02-19 22:54:30 -08:00
commit f82b738db7
10 changed files with 2860 additions and 0 deletions

1
.envrc Normal file
View file

@ -0,0 +1 @@
use flake .

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
/target
/result
.direnv/

2137
Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

12
Cargo.toml Normal file
View file

@ -0,0 +1,12 @@
[package]
name = "paper"
version = "0.1.0"
edition = "2024"
[dependencies]
# Application-level error handling (Result + context chaining).
anyhow = "1"
# Derive-based CLI parsing.
clap = { version = "4", features = ["derive"] }
# Blocking HTTP client; default features disabled so only rustls TLS is
# pulled in (no native-tls / openssl, no async runtime).
reqwest = { version = "0.12", features = ["blocking", "rustls-tls", "json"], default-features = false }
# CSS-selector HTML parsing for scraping LibGen pages.
scraper = "0.22"
# Parsing the Anna's Archive fast-download JSON response.
serde_json = "1"
# Scratch directory for the downloaded PDF and marker output.
tempfile = "3"

View file

@ -0,0 +1,49 @@
# paper CLI — Design
A CLI tool that downloads academic papers by DOI — from LibGen first, with an Anna's Archive fallback — and converts them to markdown.
## CLI Interface
```
paper <DOI>
```
Single positional argument. Markdown output goes to stdout.
```
paper 10.1038/nature12373 > paper.md
```
## Download Flow
1. Search LibGen (`index.php?req=<DOI>`) with a browser-like User-Agent and take the first matching edition
2. Resolve the edition page to the file's MD5, then extract the one-time download key from the ads page
3. Download the PDF via `get.php?md5=...&key=...` to a temp file
4. Fallback: if `ANNAS_ARCHIVE_KEY` is set, reuse the resolved MD5 with Anna's Archive's fast-download JSON API
5. Exit with a clear error if no source yields a PDF
## Conversion
1. Shell out to `marker_single <tempfile.pdf> --output_dir <tempdir>`
2. Read the generated `.md` file from the output dir
3. Print to stdout
4. Clean up temp dir
## Error Handling
- `marker_single` not on PATH: tell user to install (`pip install marker-pdf`)
- Conversion failure: forward marker's stderr
- Network errors: surface reqwest errors clearly
- No PDF found: specific error message with the DOI
## Dependencies
- `clap` — argument parsing
- `reqwest` (blocking, rustls-tls) — HTTP
- `scraper` — HTML parsing
- `tempfile` — temp directory
- `anyhow` — error handling
## Dev Environment
The nix flake includes Rust nightly toolchain and marker-pdf in the devshell.

82
flake.lock generated Normal file
View file

@ -0,0 +1,82 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1771207753,
"narHash": "sha256-b9uG8yN50DRQ6A7JdZBfzq718ryYrlmGgqkRm9OOwCE=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d1c15b7d5806069da59e819999d70e1cec0760bf",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"rust-overlay": "rust-overlay"
}
},
"rust-overlay": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1771470520,
"narHash": "sha256-PvytHcaYN5cPUll7FB70mXv1rRsIBRmu47fFfq3haxA=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "a1d4cc1f264c45d3745af0d2ca5e59d460e58777",
"type": "github"
},
"original": {
"owner": "oxalica",
"repo": "rust-overlay",
"type": "github"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

55
flake.nix Normal file
View file

@ -0,0 +1,55 @@
{
description = "paper download papers by DOI and convert to markdown";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
flake-utils.url = "github:numtide/flake-utils";
# rust-overlay supplies nightly toolchains; pinned to our nixpkgs so only
# one nixpkgs evaluation is carried in the lock file.
rust-overlay = {
url = "github:oxalica/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
};
outputs = { self, nixpkgs, flake-utils, rust-overlay }:
flake-utils.lib.eachDefaultSystem (system:
let
overlays = [ (import rust-overlay) ];
pkgs = import nixpkgs { inherit system overlays; };
# Nightly toolchain for the devshell, with sources and rust-analyzer for
# IDE support.
rust-nightly = pkgs.rust-bin.nightly.latest.default.override {
extensions = [ "rust-src" "rust-analyzer" ];
};
# marker-pdf packaging (not in nixpkgs) — see nix/marker.nix.
marker = import ./nix/marker.nix { inherit pkgs; };
# The plain Rust binary, built from the committed Cargo.lock.
paper = pkgs.rustPlatform.buildRustPackage {
pname = "paper";
version = "0.1.0";
src = pkgs.lib.cleanSource ./.;
cargoLock.lockFile = ./Cargo.lock;
};
# Wrap the paper binary so marker_single is on PATH
paper-wrapped = pkgs.symlinkJoin {
name = "paper-${paper.version}";
paths = [ paper ];
nativeBuildInputs = [ pkgs.makeWrapper ];
postBuild = ''
wrapProgram $out/bin/paper \
--prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv ]}
'';
};
in
{
packages = {
# Default package is the wrapped binary: `nix run` works out of the box.
default = paper-wrapped;
# Unwrapped binary for users who manage marker_single themselves.
unwrapped = paper;
};
# Devshell: cargo/rustc (nightly) plus marker_single for end-to-end runs.
devShells.default = pkgs.mkShell {
buildInputs = [
rust-nightly
marker.markerEnv
];
};
});
}

136
nix/marker.nix Normal file
View file

@ -0,0 +1,136 @@
# Nix expressions for marker-pdf and its missing dependencies.
#
# marker-pdf (and several of its dependencies) are not packaged in nixpkgs,
# so each is built here with buildPythonPackage and exposed together as
# `markerEnv`, a python environment that puts `marker_single` on PATH.
{ pkgs }:
let
python3Packages = pkgs.python3Packages;
# pypdfium2 4.30.0 — pinned because pdftext and surya-ocr require v4.x API.
# Installed from the manylinux wheel which bundles libpdfium.
pypdfium2 = python3Packages.buildPythonPackage rec {
pname = "pypdfium2";
version = "4.30.0";
format = "wheel";
src = pkgs.fetchurl {
url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
hash = "sha256-8feNIYng3fmsK3qbm9Twxm9U0Tif9sF+n9ncA00G6z8=";
};
# The wheel ships a prebuilt libpdfium ELF; autoPatchelfHook rewrites its
# interpreter/rpath, with libstdc++ provided via cc.lib.
nativeBuildInputs = [ pkgs.autoPatchelfHook ];
buildInputs = [ pkgs.stdenv.cc.cc.lib ];
pythonImportsCheck = [ "pypdfium2" ];
};
pdftext = python3Packages.buildPythonPackage rec {
pname = "pdftext";
version = "0.6.3";
pyproject = true;
src = pkgs.fetchPypi {
inherit pname version;
hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
};
build-system = [ python3Packages.poetry-core ];
# Upstream pins exact dependency versions; relax them so the nixpkgs
# versions satisfy the requirements.
nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
pythonRelaxDeps = true;
dependencies = [
pypdfium2
python3Packages.pydantic
python3Packages.pydantic-settings
python3Packages.click
];
# Tests require PDF fixtures not included in the sdist
doCheck = false;
pythonImportsCheck = [ "pdftext" ];
};
surya-ocr = python3Packages.buildPythonPackage rec {
pname = "surya-ocr";
version = "0.17.1";
pyproject = true;
src = pkgs.fetchPypi {
# PyPI normalizes the name to an underscore for the sdist filename.
pname = "surya_ocr";
inherit version;
hash = "sha256-NJ142FTB7V+Bblg1Re1kUaoLxpkuKDqAUDR5mqzujCQ=";
};
build-system = [ python3Packages.poetry-core ];
nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
pythonRelaxDeps = true;
# pre-commit is a dev-only dependency; drop it from runtime requirements.
pythonRemoveDeps = [ "pre-commit" ];
dependencies = [
python3Packages.transformers
python3Packages.torch
python3Packages.pydantic
python3Packages.pydantic-settings
python3Packages.python-dotenv
python3Packages.pillow
pypdfium2
python3Packages.filetype
python3Packages.click
python3Packages.platformdirs
python3Packages.opencv-python-headless
python3Packages.einops
];
# Tests require model weights and GPU
doCheck = false;
pythonImportsCheck = [ "surya" ];
};
marker-pdf = python3Packages.buildPythonPackage rec {
pname = "marker-pdf";
version = "1.10.2";
pyproject = true;
src = pkgs.fetchPypi {
pname = "marker_pdf";
inherit version;
hash = "sha256-zg/IOeEa11GaV20lTKnVGg+UVLnX2gIhH3IrFBMX+fE=";
};
build-system = [ python3Packages.poetry-core ];
nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
pythonRelaxDeps = true;
pythonRemoveDeps = [ "pre-commit" ];
dependencies = [
python3Packages.pillow
python3Packages.pydantic
python3Packages.pydantic-settings
python3Packages.transformers
python3Packages.python-dotenv
python3Packages.torch
python3Packages.tqdm
python3Packages.ftfy
python3Packages.rapidfuzz
surya-ocr
python3Packages.regex
pdftext
python3Packages.markdownify
python3Packages.click
python3Packages.markdown2
python3Packages.filetype
python3Packages.google-genai
python3Packages.anthropic
python3Packages.scikit-learn
python3Packages.openai
];
# Tests require model weights
doCheck = false;
pythonImportsCheck = [ "marker" ];
};
in
{
inherit pypdfium2 pdftext surya-ocr marker-pdf;
# Python environment with marker_single on PATH
markerEnv = python3Packages.python.withPackages (_: [ marker-pdf ]);
}

41
skill/SKILL.md Normal file
View file

@ -0,0 +1,41 @@
---
name: paper-reader
description: Fetch and read academic papers by DOI. Use when (1) the user mentions a DOI (e.g., 10.1038/nature12373), asks to read/summarize/analyze a research paper, or references a paper they want to work with, or (2) Claude needs to consult a specific paper as part of research — e.g., a web search returns a relevant DOI, or a cited paper would help answer the user's question. Converts PDFs to markdown so the paper content can be read and discussed.
---
# Paper Reader
Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown via `marker_single`.
## Usage
```bash
# Fetch a paper and save to a temp file, then read it
paper <DOI> > /tmp/paper.md
# Bypass cache to re-download
paper --no-cache <DOI> > /tmp/paper.md
```
Output goes to stdout (markdown). Progress/status goes to stderr. DOIs can be passed with or without the `https://doi.org/` prefix.
## Workflow
1. Extract the DOI from the user's message (look for patterns like `10.xxxx/...`)
2. Run `paper <DOI> > /tmp/paper-<sanitized-doi>.md` via Bash
3. Read the resulting markdown file
4. Respond to what the user asked (summarize, explain, answer questions, etc.)
## Caching
Results are cached at `~/.cache/paper/<DOI>.md`. Subsequent requests for the same DOI return instantly. Use `--no-cache` only when the user explicitly wants a fresh conversion.
## Download Sources
The tool tries LibGen first (free, no authentication), then falls back to Anna's Archive fast download API if `ANNAS_ARCHIVE_KEY` is set.
## Errors
- **"marker_single not found"**: The `marker_single` Python tool is not installed. Run `pip install marker-pdf` or use the nix devshell in `~/proj/paper`.
- **"no results found on LibGen"**: The DOI may not be in LibGen's collection. Verify the DOI is correct.
- **"all download sources failed"**: Neither LibGen nor Anna's Archive had the paper. The user may need to find it manually.

344
src/main.rs Normal file
View file

@ -0,0 +1,344 @@
use std::io::{self, Read};
use std::path::PathBuf;
use std::process::Command;
use anyhow::{Context, bail};
use clap::Parser;
use scraper::{Html, Selector};
const USER_AGENT: &str =
"Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0";
const LIBGEN_BASE: &str = "https://libgen.li";
// CLI arguments, parsed via clap's derive API.
// NOTE: the `///` doc comments on fields below double as `--help` text at
// runtime, so their wording is deliberately left untouched here.
#[derive(Parser)]
#[command(about = "Download a paper by DOI and convert it to markdown")]
struct Args {
/// The DOI of the paper to download
doi: String,
/// Skip the cache and re-download/re-convert
#[arg(long)]
no_cache: bool,
}
/// Entry point: normalize the DOI, serve the cached markdown when allowed,
/// otherwise download the PDF, convert it, repopulate the cache, and print
/// the markdown to stdout.
fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    // Accept bare DOIs plus the common URL/URI spellings users paste:
    // https://doi.org/..., http://doi.org/..., (dx.)doi.org variants, doi:...
    let doi = args
        .doi
        .trim_start_matches("https://doi.org/")
        .trim_start_matches("http://doi.org/")
        .trim_start_matches("https://dx.doi.org/")
        .trim_start_matches("http://dx.doi.org/")
        .trim_start_matches("doi:");
    if !args.no_cache {
        if let Some(cached) = read_cache(doi) {
            print!("{cached}");
            return Ok(());
        }
    }
    let pdf_bytes = download_pdf(doi)?;
    let markdown = convert_to_markdown(&pdf_bytes)?;
    write_cache(doi, &markdown);
    print!("{markdown}");
    Ok(())
}
// ---------------------------------------------------------------------------
// Cache
// ---------------------------------------------------------------------------
/// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`.
///
/// Yields `None` only when neither `XDG_CACHE_HOME` nor `HOME` is set.
fn cache_dir() -> Option<PathBuf> {
    let base = match std::env::var_os("XDG_CACHE_HOME") {
        Some(xdg) => PathBuf::from(xdg),
        None => PathBuf::from(std::env::var_os("HOME")?).join(".cache"),
    };
    Some(base.join("paper"))
}
/// Path to a cached markdown file for a given DOI.
/// DOI `10.1038/nature12373` maps to `<cache>/10.1038/nature12373.md`.
///
/// Returns `None` (which simply disables caching) when no cache dir can be
/// resolved, or when the DOI contains path components (`..`, `.`, or empty)
/// that would let the joined path escape the cache root — the DOI is
/// user-supplied input embedded directly in a filesystem path here.
fn cache_path(doi: &str) -> Option<PathBuf> {
    let has_unsafe_component = doi
        .split('/')
        .any(|part| part.is_empty() || part == "." || part == "..");
    if has_unsafe_component {
        return None;
    }
    cache_dir().map(|d| d.join(format!("{doi}.md")))
}
/// Load previously converted markdown for `doi`, if present.
///
/// Any read failure (missing file, permissions, invalid UTF-8) is treated
/// as a cache miss; the "using cached" notice is only printed on a hit.
fn read_cache(doi: &str) -> Option<String> {
    let path = cache_path(doi)?;
    let content = std::fs::read_to_string(&path).ok()?;
    eprintln!("Using cached result from {}", path.display());
    Some(content)
}
/// Best-effort cache write: persist converted markdown for `doi`.
///
/// All errors are deliberately ignored — a failed cache write must never
/// fail the overall run (the markdown has already been produced).
fn write_cache(doi: &str, markdown: &str) {
    let path = match cache_path(doi) {
        Some(p) => p,
        None => return,
    };
    if let Some(parent) = path.parent() {
        let _ = std::fs::create_dir_all(parent);
    }
    let _ = std::fs::write(&path, markdown);
}
// ---------------------------------------------------------------------------
// Download
// ---------------------------------------------------------------------------
/// Build a blocking HTTP client with a browser-like User-Agent (some of the
/// sources serve different content to obvious bot UAs).
fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
    let builder = reqwest::blocking::Client::builder().user_agent(USER_AGENT);
    builder.build().map_err(Into::into)
}
/// Download a paper PDF by DOI.
///
/// LibGen is tried first (free, no JS challenge). If that fails and an
/// Anna's Archive API key is configured via `ANNAS_ARCHIVE_KEY`, the fast
/// download API is tried as a fallback.
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
    let client = http_client()?;
    // Primary source: LibGen. On success we are done; on failure keep the
    // error for logging and fall through.
    let e = match download_via_libgen(&client, doi) {
        Ok(bytes) => return Ok(bytes),
        Err(e) => e,
    };
    eprintln!("LibGen failed: {e:#}");
    // Fallback: Anna's Archive, gated on an API key. The API needs an MD5,
    // which we attempt to recover from LibGen search even when the LibGen
    // download itself failed (the search step may still have worked).
    if let Ok(key) = std::env::var("ANNAS_ARCHIVE_KEY") {
        if let Some(md5) = resolve_md5_from_libgen(&client, doi) {
            match download_via_annas_archive(&client, &md5, &key) {
                Ok(bytes) => return Ok(bytes),
                Err(e) => eprintln!("Anna's Archive API failed: {e:#}"),
            }
        }
    }
    bail!("all download sources failed for DOI {doi}")
}
// -- LibGen -----------------------------------------------------------------
/// Resolve a DOI to a paper MD5 via LibGen search + edition page.
///
/// Returns `None` if either step fails; errors are not surfaced because
/// this is only used opportunistically for the Anna's Archive fallback.
fn resolve_md5_from_libgen(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> Option<String> {
    match libgen_search(client, doi) {
        Ok(edition_id) => libgen_edition_md5(client, &edition_id).ok(),
        Err(_) => None,
    }
}
/// Download a paper PDF from LibGen by DOI.
///
/// Pipeline: search → edition page → file MD5 → one-time download key →
/// PDF bytes (sanity-checked before returning).
fn download_via_libgen(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> anyhow::Result<Vec<u8>> {
    eprintln!("Searching LibGen for DOI {doi}");
    let edition_id = libgen_search(client, doi)?;
    eprintln!("Found edition {edition_id}, resolving download link…");
    let md5 = libgen_edition_md5(client, &edition_id)?;
    let download_key = libgen_download_key(client, &md5)?;
    let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}");
    eprintln!("Downloading PDF…");
    let response = client
        .get(&download_url)
        .send()
        .context("failed to request PDF from LibGen")?;
    let bytes = response.bytes().context("failed to read PDF body")?;
    validate_pdf(&bytes)?;
    Ok(bytes.to_vec())
}
/// Search LibGen by DOI and return the first matching edition ID.
///
/// The DOI is sent via `RequestBuilder::query` so characters that are
/// special in URLs (`&`, `#`, `+`, spaces, …) are percent-encoded instead
/// of silently corrupting the query string, as direct interpolation would.
fn libgen_search(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> anyhow::Result<String> {
    let html = client
        .get(format!("{LIBGEN_BASE}/index.php"))
        // Matches the original query: req=<doi>&topics[]=a&res=25.
        .query(&[("req", doi), ("topics[]", "a"), ("res", "25")])
        .send()
        .context("failed to search LibGen")?
        .text()?;
    let doc = Html::parse_document(&html);
    // Result rows link to edition pages via edition.php?id=<edition id>.
    let sel =
        Selector::parse("a[href*='edition.php?id=']").expect("valid selector");
    for el in doc.select(&sel) {
        if let Some(href) = el.value().attr("href") {
            // Accept both relative ("edition.php?id=…") and rooted
            // ("/edition.php?id=…") hrefs, mirroring libgen_edition_md5.
            if let Some(id) = href.trim_start_matches('/').strip_prefix("edition.php?id=") {
                return Ok(id.to_string());
            }
        }
    }
    bail!("no results found on LibGen for DOI {doi}")
}
/// Fetch a LibGen edition page and extract the file's MD5.
fn libgen_edition_md5(
    client: &reqwest::blocking::Client,
    edition_id: &str,
) -> anyhow::Result<String> {
    let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}");
    let html = client.get(&url).send()?.text()?;
    let doc = Html::parse_document(&html);
    let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector");
    for el in doc.select(&sel) {
        let href = match el.value().attr("href") {
            Some(h) => h,
            None => continue,
        };
        // The link may be written relative or rooted; try both spellings.
        for prefix in ["ads.php?md5=", "/ads.php?md5="] {
            if let Some(rest) = href.strip_prefix(prefix) {
                // href may have extra params after the md5 — keep only it.
                let md5 = rest.split('&').next().unwrap_or(rest);
                return Ok(md5.to_string());
            }
        }
    }
    bail!("no download link found on edition page {edition_id}")
}
/// Fetch the LibGen ads/download page for an MD5 and extract the one-time
/// download key.
fn libgen_download_key(
    client: &reqwest::blocking::Client,
    md5: &str,
) -> anyhow::Result<String> {
    let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}");
    let html = client.get(&url).send()?.text()?;
    let doc = Html::parse_document(&html);
    let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector");
    for el in doc.select(&sel) {
        let Some(href) = el.value().attr("href") else { continue };
        // The one-time key lives in the `key=` query param of the get.php
        // link; take everything between "key=" and the next '&'.
        if let Some(idx) = href.find("key=") {
            let tail = &href[idx + "key=".len()..];
            let key = tail.split('&').next().unwrap_or(tail);
            if !key.is_empty() {
                return Ok(key.to_string());
            }
        }
    }
    bail!("no download key found on LibGen ads page for md5 {md5}")
}
// -- Anna's Archive ---------------------------------------------------------
/// Download a paper PDF via the Anna's Archive fast download JSON API.
fn download_via_annas_archive(
client: &reqwest::blocking::Client,
md5: &str,
key: &str,
) -> anyhow::Result<Vec<u8>> {
eprintln!("Trying Anna's Archive fast download API…");
let api_url = format!(
"https://annas-archive.li/dyn/api/fast_download.json?md5={md5}&key={key}"
);
let resp: serde_json::Value = client
.get(&api_url)
.send()
.context("failed to call Anna's Archive API")?
.json()
.context("failed to parse Anna's Archive API response")?;
if let Some(err) = resp.get("error").and_then(|e| e.as_str()) {
if !err.is_empty() {
bail!("Anna's Archive API error: {err}");
}
}
let download_url = resp
.get("download_url")
.and_then(|u| u.as_str())
.context("no download_url in Anna's Archive API response")?;
eprintln!("Downloading PDF from Anna's Archive…");
let bytes = client
.get(download_url)
.send()
.context("failed to download from Anna's Archive")?
.bytes()?;
validate_pdf(&bytes)?;
Ok(bytes.to_vec())
}
// -- Helpers ----------------------------------------------------------------
/// Sanity-check downloaded bytes before handing them to the converter.
///
/// Rejects tiny payloads (typically an empty response or short error body)
/// and payloads with no `%PDF-` header in the first 1 KiB — LibGen mirrors
/// and download hosts sometimes serve HTML error pages with a 200 status,
/// which would previously slip through on size alone.
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
    if bytes.len() < 1024 {
        bail!(
            "downloaded file is suspiciously small ({} bytes) — may not be a valid PDF",
            bytes.len()
        );
    }
    // The PDF header normally sits at offset 0, but real-world files may
    // carry a small junk prefix, so scan the first 1 KiB for the magic.
    let head = &bytes[..1024];
    if !head.windows(5).any(|w| w == b"%PDF-") {
        bail!("downloaded file does not look like a PDF (no %PDF- header found)");
    }
    Ok(())
}
// ---------------------------------------------------------------------------
// Conversion
// ---------------------------------------------------------------------------
/// Write PDF bytes to a temp file, run marker_single, and return the
/// resulting markdown.
///
/// The PDF and marker's output live in a fresh temp directory that is
/// removed automatically when `tmp_dir` drops at the end of this function.
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
    let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
    let pdf_path = tmp_dir.path().join("paper.pdf");
    let out_dir = tmp_dir.path().join("output");
    std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
    eprintln!("Converting PDF to markdown…");
    let status = Command::new("marker_single")
        .arg(&pdf_path)
        .arg("--output_dir")
        .arg(&out_dir)
        .arg("--output_format")
        .arg("markdown")
        .status();
    match status {
        Ok(s) if s.success() => {}
        Ok(s) => bail!("marker_single exited with {s}"),
        // Spawn failure with NotFound means the binary is missing entirely.
        Err(e) if e.kind() == io::ErrorKind::NotFound => {
            bail!("marker_single not found on PATH. Install it with:\n pip install marker-pdf");
        }
        Err(e) => bail!("failed to run marker_single: {e}"),
    }
    // marker_single creates a subdirectory inside our output dir — find
    // the .md file within it.
    find_markdown_file(&out_dir)
}
/// Recursively search a directory for the first .md file and read it.
fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
for entry in std::fs::read_dir(dir).context("failed to read marker output directory")? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
if let Ok(md) = find_markdown_file(&path) {
return Ok(md);
}
} else if path.extension().is_some_and(|ext| ext == "md") {
let mut content = String::new();
std::fs::File::open(&path)?.read_to_string(&mut content)?;
return Ok(content);
}
}
bail!("no .md file found in marker output")
}