Add paper CLI: download academic papers by DOI and convert to markdown
Downloads PDFs from LibGen (primary) or Anna's Archive API (fallback), converts to markdown via marker_single, and prints to stdout. Includes XDG-compliant caching, nix flake with marker-pdf packaging, and a Claude Code skill for paper-reader integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
f82b738db7
10 changed files with 2860 additions and 0 deletions
344
src/main.rs
Normal file
344
src/main.rs
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
use std::io::{self, Read};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use clap::Parser;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
/// `User-Agent` header value (a desktop Firefox on Linux string) that
/// `http_client` installs on the shared HTTP client.
const USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0";

/// Scheme and host that every LibGen URL in this file is built from.
const LIBGEN_BASE: &str = "https://libgen.li";
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(about = "Download a paper by DOI and convert it to markdown")]
|
||||
struct Args {
|
||||
/// The DOI of the paper to download
|
||||
doi: String,
|
||||
|
||||
/// Skip the cache and re-download/re-convert
|
||||
#[arg(long)]
|
||||
no_cache: bool,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
let doi = args.doi.trim_start_matches("https://doi.org/");
|
||||
|
||||
if !args.no_cache {
|
||||
if let Some(cached) = read_cache(doi) {
|
||||
print!("{cached}");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
let pdf_bytes = download_pdf(doi)?;
|
||||
let markdown = convert_to_markdown(&pdf_bytes)?;
|
||||
|
||||
write_cache(doi, &markdown);
|
||||
print!("{markdown}");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cache
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`.
///
/// Following the XDG Base Directory specification, an `XDG_CACHE_HOME` that
/// is unset, empty, or not an absolute path is ignored and the `$HOME/.cache`
/// default is used instead. Returns `None` when neither variable yields a
/// usable base directory.
fn cache_dir() -> Option<PathBuf> {
    // An empty path is not absolute, so this filter covers the "set but
    // empty" case as well as relative values.
    let xdg_cache = std::env::var_os("XDG_CACHE_HOME")
        .map(PathBuf::from)
        .filter(|dir| dir.is_absolute());

    let base = xdg_cache.or_else(|| {
        std::env::var_os("HOME")
            .filter(|home| !home.is_empty())
            .map(|home| PathBuf::from(home).join(".cache"))
    })?;

    Some(base.join("paper"))
}
|
||||
|
||||
/// Path to a cached markdown file for a given DOI.
|
||||
/// DOI `10.1038/nature12373` maps to `<cache>/10.1038/nature12373.md`.
|
||||
fn cache_path(doi: &str) -> Option<PathBuf> {
|
||||
cache_dir().map(|d| d.join(format!("{doi}.md")))
|
||||
}
|
||||
|
||||
fn read_cache(doi: &str) -> Option<String> {
|
||||
let path = cache_path(doi)?;
|
||||
match std::fs::read_to_string(&path) {
|
||||
Ok(content) => {
|
||||
eprintln!("Using cached result from {}", path.display());
|
||||
Some(content)
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn write_cache(doi: &str, markdown: &str) {
|
||||
let Some(path) = cache_path(doi) else { return };
|
||||
if let Some(parent) = path.parent() {
|
||||
let _ = std::fs::create_dir_all(parent);
|
||||
}
|
||||
let _ = std::fs::write(&path, markdown);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Download
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
|
||||
Ok(reqwest::blocking::Client::builder()
|
||||
.user_agent(USER_AGENT)
|
||||
.build()?)
|
||||
}
|
||||
|
||||
/// Download a paper PDF by DOI.
|
||||
///
|
||||
/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's
|
||||
/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast
|
||||
/// download API as a fallback.
|
||||
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
||||
let client = http_client()?;
|
||||
|
||||
// Try LibGen first.
|
||||
match download_via_libgen(&client, doi) {
|
||||
Ok(bytes) => return Ok(bytes),
|
||||
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
||||
}
|
||||
|
||||
// Try Anna's Archive fast download API if a key is available.
|
||||
// This requires an MD5 — attempt to resolve one from LibGen even if the
|
||||
// download itself failed (the search may have worked).
|
||||
if let Ok(key) = std::env::var("ANNAS_ARCHIVE_KEY") {
|
||||
if let Some(md5) = resolve_md5_from_libgen(&client, doi) {
|
||||
match download_via_annas_archive(&client, &md5, &key) {
|
||||
Ok(bytes) => return Ok(bytes),
|
||||
Err(e) => eprintln!("Anna's Archive API failed: {e:#}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bail!("all download sources failed for DOI {doi}")
|
||||
}
|
||||
|
||||
// -- LibGen -----------------------------------------------------------------
|
||||
|
||||
/// Resolve a DOI to a paper MD5 via LibGen search + edition page.
|
||||
fn resolve_md5_from_libgen(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> Option<String> {
|
||||
let edition_id = libgen_search(client, doi).ok()?;
|
||||
libgen_edition_md5(client, &edition_id).ok()
|
||||
}
|
||||
|
||||
/// Download a paper PDF from LibGen by DOI.
|
||||
fn download_via_libgen(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
eprintln!("Searching LibGen for DOI {doi}");
|
||||
let edition_id = libgen_search(client, doi)?;
|
||||
|
||||
eprintln!("Found edition {edition_id}, resolving download link…");
|
||||
let md5 = libgen_edition_md5(client, &edition_id)?;
|
||||
let download_key = libgen_download_key(client, &md5)?;
|
||||
|
||||
let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}");
|
||||
eprintln!("Downloading PDF…");
|
||||
|
||||
let bytes = client
|
||||
.get(&download_url)
|
||||
.send()
|
||||
.context("failed to request PDF from LibGen")?
|
||||
.bytes()
|
||||
.context("failed to read PDF body")?;
|
||||
|
||||
validate_pdf(&bytes)?;
|
||||
Ok(bytes.to_vec())
|
||||
}
|
||||
|
||||
/// Search LibGen by DOI and return the first matching edition ID.
|
||||
fn libgen_search(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/index.php?req={doi}&topics%5B%5D=a&res=25");
|
||||
let html = client
|
||||
.get(&url)
|
||||
.send()
|
||||
.context("failed to search LibGen")?
|
||||
.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel =
|
||||
Selector::parse("a[href*='edition.php?id=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
if let Some(id) = href.strip_prefix("edition.php?id=") {
|
||||
return Ok(id.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no results found on LibGen for DOI {doi}")
|
||||
}
|
||||
|
||||
/// Fetch a LibGen edition page and extract the file's MD5.
|
||||
fn libgen_edition_md5(
|
||||
client: &reqwest::blocking::Client,
|
||||
edition_id: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}");
|
||||
let html = client.get(&url).send()?.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
if let Some(rest) = href.strip_prefix("ads.php?md5=") {
|
||||
// href may have extra params after the md5
|
||||
let md5 = rest.split('&').next().unwrap_or(rest);
|
||||
return Ok(md5.to_string());
|
||||
}
|
||||
if let Some(rest) = href.strip_prefix("/ads.php?md5=") {
|
||||
let md5 = rest.split('&').next().unwrap_or(rest);
|
||||
return Ok(md5.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no download link found on edition page {edition_id}")
|
||||
}
|
||||
|
||||
/// Fetch the LibGen ads/download page for an MD5 and extract the one-time
|
||||
/// download key.
|
||||
fn libgen_download_key(
|
||||
client: &reqwest::blocking::Client,
|
||||
md5: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}");
|
||||
let html = client.get(&url).send()?.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
// Extract key= param from the get.php link
|
||||
if let Some(idx) = href.find("key=") {
|
||||
let key = &href[idx + 4..];
|
||||
let key = key.split('&').next().unwrap_or(key);
|
||||
if !key.is_empty() {
|
||||
return Ok(key.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no download key found on LibGen ads page for md5 {md5}")
|
||||
}
|
||||
|
||||
// -- Anna's Archive ---------------------------------------------------------
|
||||
|
||||
/// Download a paper PDF via the Anna's Archive fast download JSON API.
|
||||
fn download_via_annas_archive(
|
||||
client: &reqwest::blocking::Client,
|
||||
md5: &str,
|
||||
key: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
eprintln!("Trying Anna's Archive fast download API…");
|
||||
|
||||
let api_url = format!(
|
||||
"https://annas-archive.li/dyn/api/fast_download.json?md5={md5}&key={key}"
|
||||
);
|
||||
|
||||
let resp: serde_json::Value = client
|
||||
.get(&api_url)
|
||||
.send()
|
||||
.context("failed to call Anna's Archive API")?
|
||||
.json()
|
||||
.context("failed to parse Anna's Archive API response")?;
|
||||
|
||||
if let Some(err) = resp.get("error").and_then(|e| e.as_str()) {
|
||||
if !err.is_empty() {
|
||||
bail!("Anna's Archive API error: {err}");
|
||||
}
|
||||
}
|
||||
|
||||
let download_url = resp
|
||||
.get("download_url")
|
||||
.and_then(|u| u.as_str())
|
||||
.context("no download_url in Anna's Archive API response")?;
|
||||
|
||||
eprintln!("Downloading PDF from Anna's Archive…");
|
||||
let bytes = client
|
||||
.get(download_url)
|
||||
.send()
|
||||
.context("failed to download from Anna's Archive")?
|
||||
.bytes()?;
|
||||
|
||||
validate_pdf(&bytes)?;
|
||||
Ok(bytes.to_vec())
|
||||
}
|
||||
|
||||
// -- Helpers ----------------------------------------------------------------
|
||||
|
||||
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
||||
if bytes.len() < 1024 {
|
||||
bail!(
|
||||
"downloaded file is suspiciously small ({} bytes) — may not be a valid PDF",
|
||||
bytes.len()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Conversion
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Write PDF bytes to a temp file, run marker_single, and return the
|
||||
/// resulting markdown.
|
||||
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
||||
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||
let out_dir = tmp_dir.path().join("output");
|
||||
|
||||
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
||||
|
||||
eprintln!("Converting PDF to markdown…");
|
||||
|
||||
let status = Command::new("marker_single")
|
||||
.arg(&pdf_path)
|
||||
.arg("--output_dir")
|
||||
.arg(&out_dir)
|
||||
.arg("--output_format")
|
||||
.arg("markdown")
|
||||
.status();
|
||||
|
||||
match status {
|
||||
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
||||
bail!("marker_single not found on PATH. Install it with:\n pip install marker-pdf");
|
||||
}
|
||||
Err(e) => bail!("failed to run marker_single: {e}"),
|
||||
Ok(s) if !s.success() => bail!("marker_single exited with {s}"),
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
// marker_single creates a subdirectory inside our output dir — find
|
||||
// the .md file within it.
|
||||
find_markdown_file(&out_dir)
|
||||
}
|
||||
|
||||
/// Recursively search a directory for the first .md file and read it.
|
||||
fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
|
||||
for entry in std::fs::read_dir(dir).context("failed to read marker output directory")? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
if let Ok(md) = find_markdown_file(&path) {
|
||||
return Ok(md);
|
||||
}
|
||||
} else if path.extension().is_some_and(|ext| ext == "md") {
|
||||
let mut content = String::new();
|
||||
std::fs::File::open(&path)?.read_to_string(&mut content)?;
|
||||
return Ok(content);
|
||||
}
|
||||
}
|
||||
bail!("no .md file found in marker output")
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue