Add paper CLI: download academic papers by DOI and convert to markdown
Downloads PDFs from LibGen (primary) or Anna's Archive API (fallback), converts to markdown via marker_single, and prints to stdout. Includes XDG-compliant caching, nix flake with marker-pdf packaging, and a Claude Code skill for paper-reader integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
f82b738db7
10 changed files with 2860 additions and 0 deletions
1
.envrc
Normal file
1
.envrc
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
use flake .
|
||||||
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
/target
|
||||||
|
/result
|
||||||
|
.direnv/
|
||||||
2137
Cargo.lock
generated
Normal file
2137
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
12
Cargo.toml
Normal file
12
Cargo.toml
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
[package]
|
||||||
|
name = "paper"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow = "1"
|
||||||
|
clap = { version = "4", features = ["derive"] }
|
||||||
|
reqwest = { version = "0.12", features = ["blocking", "rustls-tls", "json"], default-features = false }
|
||||||
|
scraper = "0.22"
|
||||||
|
serde_json = "1"
|
||||||
|
tempfile = "3"
|
||||||
49
docs/plans/2026-02-19-paper-cli-design.md
Normal file
49
docs/plans/2026-02-19-paper-cli-design.md
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
# paper CLI — Design
|
||||||
|
|
||||||
|
A CLI tool that downloads academic papers by DOI — trying LibGen first, with Anna's Archive as a fallback — and converts them to markdown.
|
||||||
|
|
||||||
|
## CLI Interface
|
||||||
|
|
||||||
|
```
|
||||||
|
paper <DOI>
|
||||||
|
```
|
||||||
|
|
||||||
|
Single positional argument. Markdown output goes to stdout.
|
||||||
|
|
||||||
|
```
|
||||||
|
paper 10.1038/nature12373 > paper.md
|
||||||
|
```
|
||||||
|
|
||||||
|
## Download Flow
|
||||||
|
|
||||||
|
1. Request `https://annas-archive.org/scidb/<DOI>` with a browser-like User-Agent
|
||||||
|
2. Parse HTML for `<iframe>` or `<embed>` with `id="pdf"` — extract `src` for direct PDF URL
|
||||||
|
3. Fallback: find any link ending in `.pdf`
|
||||||
|
4. Download PDF to a temp file
|
||||||
|
5. Exit with clear error if no PDF found
|
||||||
|
|
||||||
|
## Conversion
|
||||||
|
|
||||||
|
1. Shell out to `marker_single <tempfile.pdf> --output_dir <tempdir>`
|
||||||
|
2. Read the generated `.md` file from the output dir
|
||||||
|
3. Print to stdout
|
||||||
|
4. Clean up temp dir
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
- `marker_single` not on PATH: tell user to install (`pip install marker-pdf`)
|
||||||
|
- Conversion failure: forward marker's stderr
|
||||||
|
- Network errors: surface reqwest errors clearly
|
||||||
|
- No PDF found: specific error message with the DOI
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- `clap` — argument parsing
|
||||||
|
- `reqwest` (blocking, rustls-tls) — HTTP
|
||||||
|
- `scraper` — HTML parsing
|
||||||
|
- `tempfile` — temp directory
|
||||||
|
- `anyhow` — error handling
|
||||||
|
|
||||||
|
## Dev Environment
|
||||||
|
|
||||||
|
The nix flake includes Rust nightly toolchain and marker-pdf in the devshell.
|
||||||
82
flake.lock
generated
Normal file
82
flake.lock
generated
Normal file
|
|
@ -0,0 +1,82 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"flake-utils": {
|
||||||
|
"inputs": {
|
||||||
|
"systems": "systems"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1731533236,
|
||||||
|
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1771207753,
|
||||||
|
"narHash": "sha256-b9uG8yN50DRQ6A7JdZBfzq718ryYrlmGgqkRm9OOwCE=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "d1c15b7d5806069da59e819999d70e1cec0760bf",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixpkgs-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"flake-utils": "flake-utils",
|
||||||
|
"nixpkgs": "nixpkgs",
|
||||||
|
"rust-overlay": "rust-overlay"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"rust-overlay": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1771470520,
|
||||||
|
"narHash": "sha256-PvytHcaYN5cPUll7FB70mXv1rRsIBRmu47fFfq3haxA=",
|
||||||
|
"owner": "oxalica",
|
||||||
|
"repo": "rust-overlay",
|
||||||
|
"rev": "a1d4cc1f264c45d3745af0d2ca5e59d460e58777",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "oxalica",
|
||||||
|
"repo": "rust-overlay",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systems": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681028828,
|
||||||
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
55
flake.nix
Normal file
55
flake.nix
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
{
  description = "paper — download papers by DOI and convert to markdown";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    flake-utils.url = "github:numtide/flake-utils";
    rust-overlay = {
      url = "github:oxalica/rust-overlay";
      # Reuse our nixpkgs instead of pulling in a second copy.
      inputs.nixpkgs.follows = "nixpkgs";
    };
  };

  outputs = { self, nixpkgs, flake-utils, rust-overlay }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        overlays = [ (import rust-overlay) ];
        pkgs = import nixpkgs { inherit system overlays; };
        # Nightly toolchain with sources + rust-analyzer for the devshell.
        rust-nightly = pkgs.rust-bin.nightly.latest.default.override {
          extensions = [ "rust-src" "rust-analyzer" ];
        };

        # marker-pdf packaging; exposes markerEnv (puts marker_single on PATH).
        marker = import ./nix/marker.nix { inherit pkgs; };

        # The unwrapped Rust binary, built from this repo's Cargo.lock.
        paper = pkgs.rustPlatform.buildRustPackage {
          pname = "paper";
          version = "0.1.0";
          src = pkgs.lib.cleanSource ./.;
          cargoLock.lockFile = ./Cargo.lock;
        };

        # Wrap the paper binary so marker_single is on PATH
        paper-wrapped = pkgs.symlinkJoin {
          name = "paper-${paper.version}";
          paths = [ paper ];
          nativeBuildInputs = [ pkgs.makeWrapper ];
          postBuild = ''
            wrapProgram $out/bin/paper \
              --prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv ]}
          '';
        };
      in
      {
        packages = {
          default = paper-wrapped;
          unwrapped = paper;
        };

        # Dev shell: rust toolchain + marker_single available directly.
        devShells.default = pkgs.mkShell {
          buildInputs = [
            rust-nightly
            marker.markerEnv
          ];
        };
      });
}
|
||||||
136
nix/marker.nix
Normal file
136
nix/marker.nix
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
|
# Nix expressions for marker-pdf and its missing dependencies.
#
# nixpkgs does not package marker-pdf (or pdftext / surya-ocr / a v4.x
# pypdfium2), so they are built here from PyPI wheels/sdists and exposed
# together with a ready-made Python environment (`markerEnv`) that puts
# `marker_single` on PATH.
{ pkgs }:

let
  python3Packages = pkgs.python3Packages;

  # pypdfium2 4.30.0 — pinned because pdftext and surya-ocr require v4.x API.
  # Installed from the manylinux wheel which bundles libpdfium.
  pypdfium2 = python3Packages.buildPythonPackage rec {
    pname = "pypdfium2";
    version = "4.30.0";
    format = "wheel";

    src = pkgs.fetchurl {
      url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
      hash = "sha256-8feNIYng3fmsK3qbm9Twxm9U0Tif9sF+n9ncA00G6z8=";
    };

    # The wheel ships a prebuilt libpdfium; autoPatchelfHook rewires its
    # dynamic library references to nix store paths.
    nativeBuildInputs = [ pkgs.autoPatchelfHook ];
    buildInputs = [ pkgs.stdenv.cc.cc.lib ];

    pythonImportsCheck = [ "pypdfium2" ];
  };

  # pdftext — PDF text extraction layer used by marker.
  pdftext = python3Packages.buildPythonPackage rec {
    pname = "pdftext";
    version = "0.6.3";
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit pname version;
      hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
    };

    build-system = [ python3Packages.poetry-core ];
    # Upstream pins exact versions; relax so nixpkgs' versions are accepted.
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;

    dependencies = [
      pypdfium2
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.click
    ];

    # Tests require PDF fixtures not included in the sdist
    doCheck = false;
    pythonImportsCheck = [ "pdftext" ];
  };

  # surya-ocr — OCR / layout-analysis models that marker drives.
  surya-ocr = python3Packages.buildPythonPackage rec {
    pname = "surya-ocr";
    version = "0.17.1";
    pyproject = true;

    src = pkgs.fetchPypi {
      # PyPI sdist name uses an underscore, not the dash in pname.
      pname = "surya_ocr";
      inherit version;
      hash = "sha256-NJ142FTB7V+Bblg1Re1kUaoLxpkuKDqAUDR5mqzujCQ=";
    };

    build-system = [ python3Packages.poetry-core ];
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    # pre-commit is a dev-only tool; drop it from runtime requirements.
    pythonRemoveDeps = [ "pre-commit" ];

    dependencies = [
      python3Packages.transformers
      python3Packages.torch
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.python-dotenv
      python3Packages.pillow
      pypdfium2
      python3Packages.filetype
      python3Packages.click
      python3Packages.platformdirs
      python3Packages.opencv-python-headless
      python3Packages.einops
    ];

    # Tests require model weights and GPU
    doCheck = false;
    pythonImportsCheck = [ "surya" ];
  };

  # marker-pdf — the PDF→markdown converter itself (ships marker_single).
  marker-pdf = python3Packages.buildPythonPackage rec {
    pname = "marker-pdf";
    version = "1.10.2";
    pyproject = true;

    src = pkgs.fetchPypi {
      # PyPI sdist name uses an underscore, not the dash in pname.
      pname = "marker_pdf";
      inherit version;
      hash = "sha256-zg/IOeEa11GaV20lTKnVGg+UVLnX2gIhH3IrFBMX+fE=";
    };

    build-system = [ python3Packages.poetry-core ];
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    pythonRemoveDeps = [ "pre-commit" ];

    dependencies = [
      python3Packages.pillow
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.transformers
      python3Packages.python-dotenv
      python3Packages.torch
      python3Packages.tqdm
      python3Packages.ftfy
      python3Packages.rapidfuzz
      surya-ocr
      python3Packages.regex
      pdftext
      python3Packages.markdownify
      python3Packages.click
      python3Packages.markdown2
      python3Packages.filetype
      python3Packages.google-genai
      python3Packages.anthropic
      python3Packages.scikit-learn
      python3Packages.openai
    ];

    # Tests require model weights
    doCheck = false;
    pythonImportsCheck = [ "marker" ];
  };
in
{
  inherit pypdfium2 pdftext surya-ocr marker-pdf;

  # Python environment with marker_single on PATH
  markerEnv = python3Packages.python.withPackages (_: [ marker-pdf ]);
}
|
||||||
41
skill/SKILL.md
Normal file
41
skill/SKILL.md
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
---
|
||||||
|
name: paper-reader
|
||||||
|
description: Fetch and read academic papers by DOI. Use when (1) the user mentions a DOI (e.g., 10.1038/nature12373), asks to read/summarize/analyze a research paper, or references a paper they want to work with, or (2) Claude needs to consult a specific paper as part of research — e.g., a web search returns a relevant DOI, or a cited paper would help answer the user's question. Converts PDFs to markdown so the paper content can be read and discussed.
|
||||||
|
---
|
||||||
|
|
||||||
|
# Paper Reader
|
||||||
|
|
||||||
|
Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown via `marker_single`.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Fetch a paper and save to a temp file, then read it
|
||||||
|
paper <DOI> > /tmp/paper.md
|
||||||
|
|
||||||
|
# Bypass cache to re-download
|
||||||
|
paper --no-cache <DOI> > /tmp/paper.md
|
||||||
|
```
|
||||||
|
|
||||||
|
Output goes to stdout (markdown). Progress/status goes to stderr. DOIs can be passed with or without the `https://doi.org/` prefix.
|
||||||
|
|
||||||
|
## Workflow
|
||||||
|
|
||||||
|
1. Extract the DOI from the user's message (look for patterns like `10.xxxx/...`)
|
||||||
|
2. Run `paper <DOI> > /tmp/paper-<sanitized-doi>.md` via Bash
|
||||||
|
3. Read the resulting markdown file
|
||||||
|
4. Respond to what the user asked (summarize, explain, answer questions, etc.)
|
||||||
|
|
||||||
|
## Caching
|
||||||
|
|
||||||
|
Results are cached at `$XDG_CACHE_HOME/paper/<DOI>.md` (defaulting to `~/.cache/paper/<DOI>.md`). Subsequent requests for the same DOI return instantly. Use `--no-cache` only when the user explicitly wants a fresh conversion.
|
||||||
|
|
||||||
|
## Download Sources
|
||||||
|
|
||||||
|
The tool tries LibGen first (free, no authentication), then falls back to Anna's Archive fast download API if `ANNAS_ARCHIVE_KEY` is set.
|
||||||
|
|
||||||
|
## Errors
|
||||||
|
|
||||||
|
- **"marker_single not found"**: The `marker_single` Python tool is not installed. Run `pip install marker-pdf` or use the nix devshell in `~/proj/paper`.
|
||||||
|
- **"no results found on LibGen"**: The DOI may not be in LibGen's collection. Verify the DOI is correct.
|
||||||
|
- **"all download sources failed"**: Neither LibGen nor Anna's Archive had the paper. The user may need to find it manually.
|
||||||
344
src/main.rs
Normal file
344
src/main.rs
Normal file
|
|
@ -0,0 +1,344 @@
|
||||||
|
use std::io::{self, Read};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
use anyhow::{Context, bail};
|
||||||
|
use clap::Parser;
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
|
||||||
|
/// Browser-like User-Agent: some mirrors reject requests from obvious
/// non-browser HTTP clients.
const USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0";
/// Base URL of the LibGen mirror used for search, edition and download pages.
const LIBGEN_BASE: &str = "https://libgen.li";

/// Command-line arguments, parsed by clap's derive API.
#[derive(Parser)]
#[command(about = "Download a paper by DOI and convert it to markdown")]
struct Args {
    /// The DOI of the paper to download
    doi: String,

    /// Skip the cache and re-download/re-convert
    #[arg(long)]
    no_cache: bool,
}
|
||||||
|
|
||||||
|
fn main() -> anyhow::Result<()> {
|
||||||
|
let args = Args::parse();
|
||||||
|
let doi = args.doi.trim_start_matches("https://doi.org/");
|
||||||
|
|
||||||
|
if !args.no_cache {
|
||||||
|
if let Some(cached) = read_cache(doi) {
|
||||||
|
print!("{cached}");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let pdf_bytes = download_pdf(doi)?;
|
||||||
|
let markdown = convert_to_markdown(&pdf_bytes)?;
|
||||||
|
|
||||||
|
write_cache(doi, &markdown);
|
||||||
|
print!("{markdown}");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// Cache
// ---------------------------------------------------------------------------

/// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`.
/// Returns `None` when neither environment variable is set.
fn cache_dir() -> Option<PathBuf> {
    let base = match std::env::var_os("XDG_CACHE_HOME") {
        Some(xdg) => PathBuf::from(xdg),
        None => PathBuf::from(std::env::var_os("HOME")?).join(".cache"),
    };
    Some(base.join("paper"))
}

/// Path to a cached markdown file for a given DOI.
/// DOI `10.1038/nature12373` maps to `<cache>/10.1038/nature12373.md`
/// (the `/` in the DOI becomes a subdirectory).
fn cache_path(doi: &str) -> Option<PathBuf> {
    let dir = cache_dir()?;
    Some(dir.join(format!("{doi}.md")))
}

/// Read a cached conversion, announcing the hit on stderr.
/// Any read failure (most commonly: not cached yet) is treated as a miss.
fn read_cache(doi: &str) -> Option<String> {
    let path = cache_path(doi)?;
    let content = std::fs::read_to_string(&path).ok()?;
    eprintln!("Using cached result from {}", path.display());
    Some(content)
}

/// Store a conversion result in the cache. Failures are deliberately
/// ignored — caching is best-effort and must never fail the main flow.
fn write_cache(doi: &str, markdown: &str) {
    if let Some(path) = cache_path(doi) {
        if let Some(parent) = path.parent() {
            let _ = std::fs::create_dir_all(parent);
        }
        let _ = std::fs::write(&path, markdown);
    }
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Download
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
|
||||||
|
Ok(reqwest::blocking::Client::builder()
|
||||||
|
.user_agent(USER_AGENT)
|
||||||
|
.build()?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Download a paper PDF by DOI.
|
||||||
|
///
|
||||||
|
/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's
|
||||||
|
/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast
|
||||||
|
/// download API as a fallback.
|
||||||
|
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let client = http_client()?;
|
||||||
|
|
||||||
|
// Try LibGen first.
|
||||||
|
match download_via_libgen(&client, doi) {
|
||||||
|
Ok(bytes) => return Ok(bytes),
|
||||||
|
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try Anna's Archive fast download API if a key is available.
|
||||||
|
// This requires an MD5 — attempt to resolve one from LibGen even if the
|
||||||
|
// download itself failed (the search may have worked).
|
||||||
|
if let Ok(key) = std::env::var("ANNAS_ARCHIVE_KEY") {
|
||||||
|
if let Some(md5) = resolve_md5_from_libgen(&client, doi) {
|
||||||
|
match download_via_annas_archive(&client, &md5, &key) {
|
||||||
|
Ok(bytes) => return Ok(bytes),
|
||||||
|
Err(e) => eprintln!("Anna's Archive API failed: {e:#}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bail!("all download sources failed for DOI {doi}")
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- LibGen -----------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Resolve a DOI to a paper MD5 via LibGen search + edition page.
|
||||||
|
fn resolve_md5_from_libgen(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
doi: &str,
|
||||||
|
) -> Option<String> {
|
||||||
|
let edition_id = libgen_search(client, doi).ok()?;
|
||||||
|
libgen_edition_md5(client, &edition_id).ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Download a paper PDF from LibGen by DOI.
|
||||||
|
fn download_via_libgen(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
doi: &str,
|
||||||
|
) -> anyhow::Result<Vec<u8>> {
|
||||||
|
eprintln!("Searching LibGen for DOI {doi}");
|
||||||
|
let edition_id = libgen_search(client, doi)?;
|
||||||
|
|
||||||
|
eprintln!("Found edition {edition_id}, resolving download link…");
|
||||||
|
let md5 = libgen_edition_md5(client, &edition_id)?;
|
||||||
|
let download_key = libgen_download_key(client, &md5)?;
|
||||||
|
|
||||||
|
let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}");
|
||||||
|
eprintln!("Downloading PDF…");
|
||||||
|
|
||||||
|
let bytes = client
|
||||||
|
.get(&download_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to request PDF from LibGen")?
|
||||||
|
.bytes()
|
||||||
|
.context("failed to read PDF body")?;
|
||||||
|
|
||||||
|
validate_pdf(&bytes)?;
|
||||||
|
Ok(bytes.to_vec())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Search LibGen by DOI and return the first matching edition ID.
|
||||||
|
fn libgen_search(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
doi: &str,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
let url = format!("{LIBGEN_BASE}/index.php?req={doi}&topics%5B%5D=a&res=25");
|
||||||
|
let html = client
|
||||||
|
.get(&url)
|
||||||
|
.send()
|
||||||
|
.context("failed to search LibGen")?
|
||||||
|
.text()?;
|
||||||
|
|
||||||
|
let doc = Html::parse_document(&html);
|
||||||
|
let sel =
|
||||||
|
Selector::parse("a[href*='edition.php?id=']").expect("valid selector");
|
||||||
|
|
||||||
|
for el in doc.select(&sel) {
|
||||||
|
if let Some(href) = el.value().attr("href") {
|
||||||
|
if let Some(id) = href.strip_prefix("edition.php?id=") {
|
||||||
|
return Ok(id.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bail!("no results found on LibGen for DOI {doi}")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch a LibGen edition page and extract the file's MD5.
|
||||||
|
fn libgen_edition_md5(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
edition_id: &str,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}");
|
||||||
|
let html = client.get(&url).send()?.text()?;
|
||||||
|
|
||||||
|
let doc = Html::parse_document(&html);
|
||||||
|
let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector");
|
||||||
|
|
||||||
|
for el in doc.select(&sel) {
|
||||||
|
if let Some(href) = el.value().attr("href") {
|
||||||
|
if let Some(rest) = href.strip_prefix("ads.php?md5=") {
|
||||||
|
// href may have extra params after the md5
|
||||||
|
let md5 = rest.split('&').next().unwrap_or(rest);
|
||||||
|
return Ok(md5.to_string());
|
||||||
|
}
|
||||||
|
if let Some(rest) = href.strip_prefix("/ads.php?md5=") {
|
||||||
|
let md5 = rest.split('&').next().unwrap_or(rest);
|
||||||
|
return Ok(md5.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bail!("no download link found on edition page {edition_id}")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch the LibGen ads/download page for an MD5 and extract the one-time
|
||||||
|
/// download key.
|
||||||
|
fn libgen_download_key(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
md5: &str,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}");
|
||||||
|
let html = client.get(&url).send()?.text()?;
|
||||||
|
|
||||||
|
let doc = Html::parse_document(&html);
|
||||||
|
let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector");
|
||||||
|
|
||||||
|
for el in doc.select(&sel) {
|
||||||
|
if let Some(href) = el.value().attr("href") {
|
||||||
|
// Extract key= param from the get.php link
|
||||||
|
if let Some(idx) = href.find("key=") {
|
||||||
|
let key = &href[idx + 4..];
|
||||||
|
let key = key.split('&').next().unwrap_or(key);
|
||||||
|
if !key.is_empty() {
|
||||||
|
return Ok(key.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bail!("no download key found on LibGen ads page for md5 {md5}")
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- Anna's Archive ---------------------------------------------------------
|
||||||
|
|
||||||
|
/// Download a paper PDF via the Anna's Archive fast download JSON API.
|
||||||
|
fn download_via_annas_archive(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
md5: &str,
|
||||||
|
key: &str,
|
||||||
|
) -> anyhow::Result<Vec<u8>> {
|
||||||
|
eprintln!("Trying Anna's Archive fast download API…");
|
||||||
|
|
||||||
|
let api_url = format!(
|
||||||
|
"https://annas-archive.li/dyn/api/fast_download.json?md5={md5}&key={key}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let resp: serde_json::Value = client
|
||||||
|
.get(&api_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to call Anna's Archive API")?
|
||||||
|
.json()
|
||||||
|
.context("failed to parse Anna's Archive API response")?;
|
||||||
|
|
||||||
|
if let Some(err) = resp.get("error").and_then(|e| e.as_str()) {
|
||||||
|
if !err.is_empty() {
|
||||||
|
bail!("Anna's Archive API error: {err}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let download_url = resp
|
||||||
|
.get("download_url")
|
||||||
|
.and_then(|u| u.as_str())
|
||||||
|
.context("no download_url in Anna's Archive API response")?;
|
||||||
|
|
||||||
|
eprintln!("Downloading PDF from Anna's Archive…");
|
||||||
|
let bytes = client
|
||||||
|
.get(download_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to download from Anna's Archive")?
|
||||||
|
.bytes()?;
|
||||||
|
|
||||||
|
validate_pdf(&bytes)?;
|
||||||
|
Ok(bytes.to_vec())
|
||||||
|
}
|
||||||
|
|
||||||
|
// -- Helpers ----------------------------------------------------------------
|
||||||
|
|
||||||
|
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
||||||
|
if bytes.len() < 1024 {
|
||||||
|
bail!(
|
||||||
|
"downloaded file is suspiciously small ({} bytes) — may not be a valid PDF",
|
||||||
|
bytes.len()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Conversion
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Write PDF bytes to a temp file, run marker_single, and return the
|
||||||
|
/// resulting markdown.
|
||||||
|
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||||
|
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
||||||
|
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||||
|
let out_dir = tmp_dir.path().join("output");
|
||||||
|
|
||||||
|
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
||||||
|
|
||||||
|
eprintln!("Converting PDF to markdown…");
|
||||||
|
|
||||||
|
let status = Command::new("marker_single")
|
||||||
|
.arg(&pdf_path)
|
||||||
|
.arg("--output_dir")
|
||||||
|
.arg(&out_dir)
|
||||||
|
.arg("--output_format")
|
||||||
|
.arg("markdown")
|
||||||
|
.status();
|
||||||
|
|
||||||
|
match status {
|
||||||
|
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
||||||
|
bail!("marker_single not found on PATH. Install it with:\n pip install marker-pdf");
|
||||||
|
}
|
||||||
|
Err(e) => bail!("failed to run marker_single: {e}"),
|
||||||
|
Ok(s) if !s.success() => bail!("marker_single exited with {s}"),
|
||||||
|
Ok(_) => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
// marker_single creates a subdirectory inside our output dir — find
|
||||||
|
// the .md file within it.
|
||||||
|
find_markdown_file(&out_dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Recursively search a directory for the first .md file and read it.
|
||||||
|
fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
|
||||||
|
for entry in std::fs::read_dir(dir).context("failed to read marker output directory")? {
|
||||||
|
let entry = entry?;
|
||||||
|
let path = entry.path();
|
||||||
|
if path.is_dir() {
|
||||||
|
if let Ok(md) = find_markdown_file(&path) {
|
||||||
|
return Ok(md);
|
||||||
|
}
|
||||||
|
} else if path.extension().is_some_and(|ext| ext == "md") {
|
||||||
|
let mut content = String::new();
|
||||||
|
std::fs::File::open(&path)?.read_to_string(&mut content)?;
|
||||||
|
return Ok(content);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bail!("no .md file found in marker output")
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue