Add paper CLI: download academic papers by DOI and convert to markdown
Downloads PDFs from LibGen (primary) or Anna's Archive API (fallback), converts to markdown via marker_single, and prints to stdout. Includes XDG-compliant caching, nix flake with marker-pdf packaging, and a Claude Code skill for paper-reader integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
f82b738db7
10 changed files with 2860 additions and 0 deletions
1
.envrc
Normal file
1
.envrc
Normal file
|
|
@ -0,0 +1 @@
|
|||
use flake .
|
||||
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
/target
|
||||
/result
|
||||
.direnv/
|
||||
2137
Cargo.lock
generated
Normal file
2137
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
12
Cargo.toml
Normal file
12
Cargo.toml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
[package]
name = "paper"
version = "0.1.0"
edition = "2024"

[dependencies]
# Ad-hoc error handling with context chains (application-style errors).
anyhow = "1"
# CLI argument parsing via #[derive(Parser)].
clap = { version = "4", features = ["derive"] }
# Blocking HTTP client; rustls instead of openssl, JSON for the Anna's Archive API.
reqwest = { version = "0.12", features = ["blocking", "rustls-tls", "json"], default-features = false }
# HTML parsing for scraping LibGen result/edition/ads pages.
scraper = "0.22"
# Untyped JSON values for the fast-download API response.
serde_json = "1"
# Temp directory for the PDF and marker_single output.
tempfile = "3"
|
||||
49
docs/plans/2026-02-19-paper-cli-design.md
Normal file
49
docs/plans/2026-02-19-paper-cli-design.md
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# paper CLI — Design
|
||||
|
||||
A CLI tool that downloads academic papers by DOI — from LibGen, with Anna's Archive as a fallback — and converts them to markdown.
|
||||
|
||||
## CLI Interface
|
||||
|
||||
```
|
||||
paper <DOI>
|
||||
```
|
||||
|
||||
Single positional argument. Markdown output goes to stdout.
|
||||
|
||||
```
|
||||
paper 10.1038/nature12373 > paper.md
|
||||
```
|
||||
|
||||
## Download Flow
|
||||
|
||||
1. Search LibGen (`https://libgen.li/index.php?req=<DOI>`) with a browser-like User-Agent and take the first matching edition
2. Fetch the edition page and extract the file's MD5 from its `ads.php?md5=…` link
3. Fetch the ads page and extract the one-time `key=` from the `get.php` link, then download via `get.php?md5=…&key=…`
4. Fallback: if `ANNAS_ARCHIVE_KEY` is set, resolve the MD5 via LibGen and call Anna's Archive's fast-download JSON API
5. Exit with a clear error if no source yields a valid PDF
|
||||
|
||||
## Conversion
|
||||
|
||||
1. Shell out to `marker_single <tempfile.pdf> --output_dir <tempdir>`
|
||||
2. Read the generated `.md` file from the output dir
|
||||
3. Print to stdout
|
||||
4. Clean up temp dir
|
||||
|
||||
## Error Handling
|
||||
|
||||
- `marker_single` not on PATH: tell user to install (`pip install marker-pdf`)
|
||||
- Conversion failure: forward marker's stderr
|
||||
- Network errors: surface reqwest errors clearly
|
||||
- No PDF found: specific error message with the DOI
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `clap` — argument parsing
|
||||
- `reqwest` (blocking, rustls-tls) — HTTP
|
||||
- `scraper` — HTML parsing
|
||||
- `tempfile` — temp directory
|
||||
- `anyhow` — error handling
|
||||
|
||||
## Dev Environment
|
||||
|
||||
The nix flake includes Rust nightly toolchain and marker-pdf in the devshell.
|
||||
82
flake.lock
generated
Normal file
82
flake.lock
generated
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1771207753,
|
||||
"narHash": "sha256-b9uG8yN50DRQ6A7JdZBfzq718ryYrlmGgqkRm9OOwCE=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d1c15b7d5806069da59e819999d70e1cec0760bf",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixpkgs-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"rust-overlay": "rust-overlay"
|
||||
}
|
||||
},
|
||||
"rust-overlay": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1771470520,
|
||||
"narHash": "sha256-PvytHcaYN5cPUll7FB70mXv1rRsIBRmu47fFfq3haxA=",
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "a1d4cc1f264c45d3745af0d2ca5e59d460e58777",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
55
flake.nix
Normal file
55
flake.nix
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
{
  description = "paper — download papers by DOI and convert to markdown";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    flake-utils.url = "github:numtide/flake-utils";
    rust-overlay = {
      url = "github:oxalica/rust-overlay";
      # Reuse our nixpkgs instead of pulling a second copy.
      inputs.nixpkgs.follows = "nixpkgs";
    };
  };

  outputs = { self, nixpkgs, flake-utils, rust-overlay }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        overlays = [ (import rust-overlay) ];
        pkgs = import nixpkgs { inherit system overlays; };
        # Nightly toolchain (edition 2024 crate) with sources + rust-analyzer
        # for the devshell.
        rust-nightly = pkgs.rust-bin.nightly.latest.default.override {
          extensions = [ "rust-src" "rust-analyzer" ];
        };

        # marker-pdf and its out-of-tree Python deps (see nix/marker.nix).
        marker = import ./nix/marker.nix { inherit pkgs; };

        # The plain Rust binary; deps pinned via the committed Cargo.lock.
        paper = pkgs.rustPlatform.buildRustPackage {
          pname = "paper";
          version = "0.1.0";
          src = pkgs.lib.cleanSource ./.;
          cargoLock.lockFile = ./Cargo.lock;
        };

        # Wrap the paper binary so marker_single is on PATH
        paper-wrapped = pkgs.symlinkJoin {
          name = "paper-${paper.version}";
          paths = [ paper ];
          nativeBuildInputs = [ pkgs.makeWrapper ];
          postBuild = ''
            wrapProgram $out/bin/paper \
              --prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv ]}
          '';
        };
      in
      {
        packages = {
          default = paper-wrapped;
          unwrapped = paper;
        };

        devShells.default = pkgs.mkShell {
          buildInputs = [
            rust-nightly
            marker.markerEnv
          ];
        };
      });
}
|
||||
136
nix/marker.nix
Normal file
136
nix/marker.nix
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
# Nix expressions for marker-pdf and its missing dependencies.
# None of these packages exist in nixpkgs, so each is built from PyPI
# artifacts with hashes pinned below.
{ pkgs }:

let
  python3Packages = pkgs.python3Packages;

  # pypdfium2 4.30.0 — pinned because pdftext and surya-ocr require v4.x API.
  # Installed from the manylinux wheel which bundles libpdfium.
  pypdfium2 = python3Packages.buildPythonPackage rec {
    pname = "pypdfium2";
    version = "4.30.0";
    format = "wheel";

    src = pkgs.fetchurl {
      url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
      hash = "sha256-8feNIYng3fmsK3qbm9Twxm9U0Tif9sF+n9ncA00G6z8=";
    };

    # The wheel ships a prebuilt libpdfium.so; patch its interpreter/rpath
    # against nixpkgs' libstdc++.
    nativeBuildInputs = [ pkgs.autoPatchelfHook ];
    buildInputs = [ pkgs.stdenv.cc.cc.lib ];

    pythonImportsCheck = [ "pypdfium2" ];
  };

  # pdftext — text-extraction layer marker uses for digital (non-OCR) PDFs.
  pdftext = python3Packages.buildPythonPackage rec {
    pname = "pdftext";
    version = "0.6.3";
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit pname version;
      hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
    };

    build-system = [ python3Packages.poetry-core ];
    # Upstream pins exact dep versions; relax so nixpkgs' versions satisfy them.
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;

    dependencies = [
      pypdfium2
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.click
    ];

    # Tests require PDF fixtures not included in the sdist
    doCheck = false;
    pythonImportsCheck = [ "pdftext" ];
  };

  # surya-ocr — OCR / layout-detection models driving marker's recognition.
  # PyPI name uses an underscore (surya_ocr) while the package name uses a dash.
  surya-ocr = python3Packages.buildPythonPackage rec {
    pname = "surya-ocr";
    version = "0.17.1";
    pyproject = true;

    src = pkgs.fetchPypi {
      pname = "surya_ocr";
      inherit version;
      hash = "sha256-NJ142FTB7V+Bblg1Re1kUaoLxpkuKDqAUDR5mqzujCQ=";
    };

    build-system = [ python3Packages.poetry-core ];
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    # pre-commit is a dev-only tool; drop it from runtime deps.
    pythonRemoveDeps = [ "pre-commit" ];

    dependencies = [
      python3Packages.transformers
      python3Packages.torch
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.python-dotenv
      python3Packages.pillow
      pypdfium2
      python3Packages.filetype
      python3Packages.click
      python3Packages.platformdirs
      python3Packages.opencv-python-headless
      python3Packages.einops
    ];

    # Tests require model weights and GPU
    doCheck = false;
    pythonImportsCheck = [ "surya" ];
  };

  # marker-pdf — the PDF→markdown converter providing the marker_single CLI.
  marker-pdf = python3Packages.buildPythonPackage rec {
    pname = "marker-pdf";
    version = "1.10.2";
    pyproject = true;

    src = pkgs.fetchPypi {
      pname = "marker_pdf";
      inherit version;
      hash = "sha256-zg/IOeEa11GaV20lTKnVGg+UVLnX2gIhH3IrFBMX+fE=";
    };

    build-system = [ python3Packages.poetry-core ];
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    pythonRemoveDeps = [ "pre-commit" ];

    dependencies = [
      python3Packages.pillow
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.transformers
      python3Packages.python-dotenv
      python3Packages.torch
      python3Packages.tqdm
      python3Packages.ftfy
      python3Packages.rapidfuzz
      surya-ocr
      python3Packages.regex
      pdftext
      python3Packages.markdownify
      python3Packages.click
      python3Packages.markdown2
      python3Packages.filetype
      python3Packages.google-genai
      python3Packages.anthropic
      python3Packages.scikit-learn
      python3Packages.openai
    ];

    # Tests require model weights
    doCheck = false;
    pythonImportsCheck = [ "marker" ];
  };
in
{
  inherit pypdfium2 pdftext surya-ocr marker-pdf;

  # Python environment with marker_single on PATH
  markerEnv = python3Packages.python.withPackages (_: [ marker-pdf ]);
}
|
||||
41
skill/SKILL.md
Normal file
41
skill/SKILL.md
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
---
|
||||
name: paper-reader
|
||||
description: Fetch and read academic papers by DOI. Use when (1) the user mentions a DOI (e.g., 10.1038/nature12373), asks to read/summarize/analyze a research paper, or references a paper they want to work with, or (2) Claude needs to consult a specific paper as part of research — e.g., a web search returns a relevant DOI, or a cited paper would help answer the user's question. Converts PDFs to markdown so the paper content can be read and discussed.
|
||||
---
|
||||
|
||||
# Paper Reader
|
||||
|
||||
Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown via `marker_single`.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Fetch a paper and save to a temp file, then read it
|
||||
paper <DOI> > /tmp/paper.md
|
||||
|
||||
# Bypass cache to re-download
|
||||
paper --no-cache <DOI> > /tmp/paper.md
|
||||
```
|
||||
|
||||
Output goes to stdout (markdown). Progress/status goes to stderr. DOIs can be passed with or without the `https://doi.org/` prefix.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Extract the DOI from the user's message (look for patterns like `10.xxxx/...`)
|
||||
2. Run `paper <DOI> > /tmp/paper-<sanitized-doi>.md` via Bash
|
||||
3. Read the resulting markdown file
|
||||
4. Respond to what the user asked (summarize, explain, answer questions, etc.)
|
||||
|
||||
## Caching
|
||||
|
||||
Results are cached at `~/.cache/paper/<DOI>.md`. Subsequent requests for the same DOI return instantly. Use `--no-cache` only when the user explicitly wants a fresh conversion.
|
||||
|
||||
## Download Sources
|
||||
|
||||
The tool tries LibGen first (free, no authentication), then falls back to Anna's Archive fast download API if `ANNAS_ARCHIVE_KEY` is set.
|
||||
|
||||
## Errors
|
||||
|
||||
- **"marker_single not found"**: The `marker_single` Python tool is not installed. Run `pip install marker-pdf` or use the nix devshell in `~/proj/paper`.
|
||||
- **"no results found on LibGen"**: The DOI may not be in LibGen's collection. Verify the DOI is correct.
|
||||
- **"all download sources failed"**: Neither LibGen nor Anna's Archive had the paper. The user may need to find it manually.
|
||||
344
src/main.rs
Normal file
344
src/main.rs
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
use std::io::{self, Read};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use clap::Parser;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
/// Browser-like User-Agent sent with every request; some mirrors reject
/// requests that advertise a default HTTP-library UA.
const USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0";
/// Base URL of the LibGen mirror used for search, edition pages, and downloads.
const LIBGEN_BASE: &str = "https://libgen.li";
||||
|
||||
#[derive(Parser)]
|
||||
#[command(about = "Download a paper by DOI and convert it to markdown")]
|
||||
struct Args {
|
||||
/// The DOI of the paper to download
|
||||
doi: String,
|
||||
|
||||
/// Skip the cache and re-download/re-convert
|
||||
#[arg(long)]
|
||||
no_cache: bool,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
let doi = args.doi.trim_start_matches("https://doi.org/");
|
||||
|
||||
if !args.no_cache {
|
||||
if let Some(cached) = read_cache(doi) {
|
||||
print!("{cached}");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
let pdf_bytes = download_pdf(doi)?;
|
||||
let markdown = convert_to_markdown(&pdf_bytes)?;
|
||||
|
||||
write_cache(doi, &markdown);
|
||||
print!("{markdown}");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cache
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`.
///
/// Per the XDG Base Directory spec, an *empty* `XDG_CACHE_HOME` is treated
/// as unset and falls back to `~/.cache`. Returns `None` only when neither
/// `XDG_CACHE_HOME` nor `HOME` is usable.
fn cache_dir() -> Option<PathBuf> {
    let base = std::env::var_os("XDG_CACHE_HOME")
        // Empty value == unset, per the XDG spec.
        .filter(|v| !v.is_empty())
        .map(PathBuf::from)
        .or_else(|| std::env::var_os("HOME").map(|h| PathBuf::from(h).join(".cache")))?;
    Some(base.join("paper"))
}
|
||||
|
||||
/// Path to a cached markdown file for a given DOI.
|
||||
/// DOI `10.1038/nature12373` maps to `<cache>/10.1038/nature12373.md`.
|
||||
fn cache_path(doi: &str) -> Option<PathBuf> {
|
||||
cache_dir().map(|d| d.join(format!("{doi}.md")))
|
||||
}
|
||||
|
||||
fn read_cache(doi: &str) -> Option<String> {
|
||||
let path = cache_path(doi)?;
|
||||
match std::fs::read_to_string(&path) {
|
||||
Ok(content) => {
|
||||
eprintln!("Using cached result from {}", path.display());
|
||||
Some(content)
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn write_cache(doi: &str, markdown: &str) {
|
||||
let Some(path) = cache_path(doi) else { return };
|
||||
if let Some(parent) = path.parent() {
|
||||
let _ = std::fs::create_dir_all(parent);
|
||||
}
|
||||
let _ = std::fs::write(&path, markdown);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Download
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
|
||||
Ok(reqwest::blocking::Client::builder()
|
||||
.user_agent(USER_AGENT)
|
||||
.build()?)
|
||||
}
|
||||
|
||||
/// Download a paper PDF by DOI.
|
||||
///
|
||||
/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's
|
||||
/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast
|
||||
/// download API as a fallback.
|
||||
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
||||
let client = http_client()?;
|
||||
|
||||
// Try LibGen first.
|
||||
match download_via_libgen(&client, doi) {
|
||||
Ok(bytes) => return Ok(bytes),
|
||||
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
||||
}
|
||||
|
||||
// Try Anna's Archive fast download API if a key is available.
|
||||
// This requires an MD5 — attempt to resolve one from LibGen even if the
|
||||
// download itself failed (the search may have worked).
|
||||
if let Ok(key) = std::env::var("ANNAS_ARCHIVE_KEY") {
|
||||
if let Some(md5) = resolve_md5_from_libgen(&client, doi) {
|
||||
match download_via_annas_archive(&client, &md5, &key) {
|
||||
Ok(bytes) => return Ok(bytes),
|
||||
Err(e) => eprintln!("Anna's Archive API failed: {e:#}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bail!("all download sources failed for DOI {doi}")
|
||||
}
|
||||
|
||||
// -- LibGen -----------------------------------------------------------------
|
||||
|
||||
/// Resolve a DOI to a paper MD5 via LibGen search + edition page.
|
||||
fn resolve_md5_from_libgen(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> Option<String> {
|
||||
let edition_id = libgen_search(client, doi).ok()?;
|
||||
libgen_edition_md5(client, &edition_id).ok()
|
||||
}
|
||||
|
||||
/// Download a paper PDF from LibGen by DOI.
|
||||
fn download_via_libgen(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
eprintln!("Searching LibGen for DOI {doi}");
|
||||
let edition_id = libgen_search(client, doi)?;
|
||||
|
||||
eprintln!("Found edition {edition_id}, resolving download link…");
|
||||
let md5 = libgen_edition_md5(client, &edition_id)?;
|
||||
let download_key = libgen_download_key(client, &md5)?;
|
||||
|
||||
let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}");
|
||||
eprintln!("Downloading PDF…");
|
||||
|
||||
let bytes = client
|
||||
.get(&download_url)
|
||||
.send()
|
||||
.context("failed to request PDF from LibGen")?
|
||||
.bytes()
|
||||
.context("failed to read PDF body")?;
|
||||
|
||||
validate_pdf(&bytes)?;
|
||||
Ok(bytes.to_vec())
|
||||
}
|
||||
|
||||
/// Search LibGen by DOI and return the first matching edition ID.
|
||||
fn libgen_search(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/index.php?req={doi}&topics%5B%5D=a&res=25");
|
||||
let html = client
|
||||
.get(&url)
|
||||
.send()
|
||||
.context("failed to search LibGen")?
|
||||
.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel =
|
||||
Selector::parse("a[href*='edition.php?id=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
if let Some(id) = href.strip_prefix("edition.php?id=") {
|
||||
return Ok(id.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no results found on LibGen for DOI {doi}")
|
||||
}
|
||||
|
||||
/// Fetch a LibGen edition page and extract the file's MD5.
|
||||
fn libgen_edition_md5(
|
||||
client: &reqwest::blocking::Client,
|
||||
edition_id: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}");
|
||||
let html = client.get(&url).send()?.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
if let Some(rest) = href.strip_prefix("ads.php?md5=") {
|
||||
// href may have extra params after the md5
|
||||
let md5 = rest.split('&').next().unwrap_or(rest);
|
||||
return Ok(md5.to_string());
|
||||
}
|
||||
if let Some(rest) = href.strip_prefix("/ads.php?md5=") {
|
||||
let md5 = rest.split('&').next().unwrap_or(rest);
|
||||
return Ok(md5.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no download link found on edition page {edition_id}")
|
||||
}
|
||||
|
||||
/// Fetch the LibGen ads/download page for an MD5 and extract the one-time
|
||||
/// download key.
|
||||
fn libgen_download_key(
|
||||
client: &reqwest::blocking::Client,
|
||||
md5: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}");
|
||||
let html = client.get(&url).send()?.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
// Extract key= param from the get.php link
|
||||
if let Some(idx) = href.find("key=") {
|
||||
let key = &href[idx + 4..];
|
||||
let key = key.split('&').next().unwrap_or(key);
|
||||
if !key.is_empty() {
|
||||
return Ok(key.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no download key found on LibGen ads page for md5 {md5}")
|
||||
}
|
||||
|
||||
// -- Anna's Archive ---------------------------------------------------------
|
||||
|
||||
/// Download a paper PDF via the Anna's Archive fast download JSON API.
|
||||
fn download_via_annas_archive(
|
||||
client: &reqwest::blocking::Client,
|
||||
md5: &str,
|
||||
key: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
eprintln!("Trying Anna's Archive fast download API…");
|
||||
|
||||
let api_url = format!(
|
||||
"https://annas-archive.li/dyn/api/fast_download.json?md5={md5}&key={key}"
|
||||
);
|
||||
|
||||
let resp: serde_json::Value = client
|
||||
.get(&api_url)
|
||||
.send()
|
||||
.context("failed to call Anna's Archive API")?
|
||||
.json()
|
||||
.context("failed to parse Anna's Archive API response")?;
|
||||
|
||||
if let Some(err) = resp.get("error").and_then(|e| e.as_str()) {
|
||||
if !err.is_empty() {
|
||||
bail!("Anna's Archive API error: {err}");
|
||||
}
|
||||
}
|
||||
|
||||
let download_url = resp
|
||||
.get("download_url")
|
||||
.and_then(|u| u.as_str())
|
||||
.context("no download_url in Anna's Archive API response")?;
|
||||
|
||||
eprintln!("Downloading PDF from Anna's Archive…");
|
||||
let bytes = client
|
||||
.get(download_url)
|
||||
.send()
|
||||
.context("failed to download from Anna's Archive")?
|
||||
.bytes()?;
|
||||
|
||||
validate_pdf(&bytes)?;
|
||||
Ok(bytes.to_vec())
|
||||
}
|
||||
|
||||
// -- Helpers ----------------------------------------------------------------
|
||||
|
||||
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
||||
if bytes.len() < 1024 {
|
||||
bail!(
|
||||
"downloaded file is suspiciously small ({} bytes) — may not be a valid PDF",
|
||||
bytes.len()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Conversion
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Write PDF bytes to a temp file, run marker_single, and return the
|
||||
/// resulting markdown.
|
||||
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
||||
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||
let out_dir = tmp_dir.path().join("output");
|
||||
|
||||
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
||||
|
||||
eprintln!("Converting PDF to markdown…");
|
||||
|
||||
let status = Command::new("marker_single")
|
||||
.arg(&pdf_path)
|
||||
.arg("--output_dir")
|
||||
.arg(&out_dir)
|
||||
.arg("--output_format")
|
||||
.arg("markdown")
|
||||
.status();
|
||||
|
||||
match status {
|
||||
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
||||
bail!("marker_single not found on PATH. Install it with:\n pip install marker-pdf");
|
||||
}
|
||||
Err(e) => bail!("failed to run marker_single: {e}"),
|
||||
Ok(s) if !s.success() => bail!("marker_single exited with {s}"),
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
// marker_single creates a subdirectory inside our output dir — find
|
||||
// the .md file within it.
|
||||
find_markdown_file(&out_dir)
|
||||
}
|
||||
|
||||
/// Recursively search a directory for the first .md file and read it.
|
||||
fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
|
||||
for entry in std::fs::read_dir(dir).context("failed to read marker output directory")? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
if let Ok(md) = find_markdown_file(&path) {
|
||||
return Ok(md);
|
||||
}
|
||||
} else if path.extension().is_some_and(|ext| ext == "md") {
|
||||
let mut content = String::new();
|
||||
std::fs::File::open(&path)?.read_to_string(&mut content)?;
|
||||
return Ok(content);
|
||||
}
|
||||
}
|
||||
bail!("no .md file found in marker output")
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue