use std::io::{self, Read}; use std::path::PathBuf; use std::process::Command; use anyhow::{Context, bail}; use clap::Parser; use scraper::{Html, Selector}; const USER_AGENT: &str = "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0"; const LIBGEN_BASE: &str = "https://libgen.li"; #[derive(Parser)] #[command(about = "Download a paper by DOI and convert it to markdown")] struct Args { /// The DOI of the paper to download doi: String, /// Skip the cache and re-download/re-convert #[arg(long)] no_cache: bool, } fn main() -> anyhow::Result<()> { let args = Args::parse(); let doi = args.doi.trim_start_matches("https://doi.org/"); if !args.no_cache { if let Some(cached) = read_cache(doi) { print!("{cached}"); return Ok(()); } } let pdf_bytes = download_pdf(doi)?; let markdown = convert_to_markdown(&pdf_bytes)?; write_cache(doi, &markdown); print!("{markdown}"); Ok(()) } // --------------------------------------------------------------------------- // Cache // --------------------------------------------------------------------------- /// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`. fn cache_dir() -> Option { let base = std::env::var_os("XDG_CACHE_HOME") .map(PathBuf::from) .or_else(|| std::env::var_os("HOME").map(|h| PathBuf::from(h).join(".cache")))?; Some(base.join("paper")) } /// Path to a cached markdown file for a given DOI. /// DOI `10.1038/nature12373` maps to `/10.1038/nature12373.md`. fn cache_path(doi: &str) -> Option { cache_dir().map(|d| d.join(format!("{doi}.md"))) } fn read_cache(doi: &str) -> Option { let path = cache_path(doi)?; match std::fs::read_to_string(&path) { Ok(content) => { eprintln!("Using cached result from {}", path.display()); Some(content) } Err(_) => None, } } fn write_cache(doi: &str, markdown: &str) { let Some(path) = cache_path(doi) else { return }; if let Some(parent) = path.parent() { let _ = std::fs::create_dir_all(parent); } let _ = std::fs::write(&path, markdown); } // --------------------------------------------------------------------------- // Download // --------------------------------------------------------------------------- fn http_client() -> anyhow::Result { Ok(reqwest::blocking::Client::builder() .user_agent(USER_AGENT) .build()?) } /// Download a paper PDF by DOI. /// /// Tries Unpaywall first (free open-access, requires `UNPAYWALL_EMAIL`). /// Falls back to LibGen (free, no JS challenge), then Anna's Archive if an /// API key is configured via `ANNAS_ARCHIVE_KEY`. fn download_pdf(doi: &str) -> anyhow::Result> { let client = http_client()?; // Try Unpaywall first (free open-access). match download_via_unpaywall(&client, doi) { Ok(bytes) => return Ok(bytes), Err(e) => eprintln!("Unpaywall: {e:#}"), } // Try LibGen. match download_via_libgen(&client, doi) { Ok(bytes) => return Ok(bytes), Err(e) => eprintln!("LibGen failed: {e:#}"), } // Try Anna's Archive fast download API if a key is available. // This requires an MD5 — attempt to resolve one from LibGen even if the // download itself failed (the search may have worked). if let Ok(key) = std::env::var("ANNAS_ARCHIVE_KEY") { if let Some(md5) = resolve_md5_from_libgen(&client, doi) { match download_via_annas_archive(&client, &md5, &key) { Ok(bytes) => return Ok(bytes), Err(e) => eprintln!("Anna's Archive API failed: {e:#}"), } } } bail!("all download sources failed for DOI {doi}") } // -- LibGen ----------------------------------------------------------------- /// Resolve a DOI to a paper MD5 via LibGen search + edition page. fn resolve_md5_from_libgen( client: &reqwest::blocking::Client, doi: &str, ) -> Option { let edition_id = libgen_search(client, doi).ok()?; libgen_edition_md5(client, &edition_id).ok() } /// Download a paper PDF from LibGen by DOI. fn download_via_libgen( client: &reqwest::blocking::Client, doi: &str, ) -> anyhow::Result> { eprintln!("Searching LibGen for DOI {doi}"); let edition_id = libgen_search(client, doi)?; eprintln!("Found edition {edition_id}, resolving download link…"); let md5 = libgen_edition_md5(client, &edition_id)?; let download_key = libgen_download_key(client, &md5)?; let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}"); eprintln!("Downloading PDF…"); let bytes = client .get(&download_url) .send() .context("failed to request PDF from LibGen")? .bytes() .context("failed to read PDF body")?; validate_pdf(&bytes)?; Ok(bytes.to_vec()) } /// Search LibGen by DOI and return the first matching edition ID. fn libgen_search( client: &reqwest::blocking::Client, doi: &str, ) -> anyhow::Result { let url = format!("{LIBGEN_BASE}/index.php?req={doi}&topics%5B%5D=a&res=25"); let html = client .get(&url) .send() .context("failed to search LibGen")? .text()?; let doc = Html::parse_document(&html); let sel = Selector::parse("a[href*='edition.php?id=']").expect("valid selector"); for el in doc.select(&sel) { if let Some(href) = el.value().attr("href") { if let Some(id) = href.strip_prefix("edition.php?id=") { return Ok(id.to_string()); } } } bail!("no results found on LibGen for DOI {doi}") } /// Fetch a LibGen edition page and extract the file's MD5. fn libgen_edition_md5( client: &reqwest::blocking::Client, edition_id: &str, ) -> anyhow::Result { let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}"); let html = client.get(&url).send()?.text()?; let doc = Html::parse_document(&html); let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector"); for el in doc.select(&sel) { if let Some(href) = el.value().attr("href") { if let Some(rest) = href.strip_prefix("ads.php?md5=") { // href may have extra params after the md5 let md5 = rest.split('&').next().unwrap_or(rest); return Ok(md5.to_string()); } if let Some(rest) = href.strip_prefix("/ads.php?md5=") { let md5 = rest.split('&').next().unwrap_or(rest); return Ok(md5.to_string()); } } } bail!("no download link found on edition page {edition_id}") } /// Fetch the LibGen ads/download page for an MD5 and extract the one-time /// download key. fn libgen_download_key( client: &reqwest::blocking::Client, md5: &str, ) -> anyhow::Result { let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}"); let html = client.get(&url).send()?.text()?; let doc = Html::parse_document(&html); let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector"); for el in doc.select(&sel) { if let Some(href) = el.value().attr("href") { // Extract key= param from the get.php link if let Some(idx) = href.find("key=") { let key = &href[idx + 4..]; let key = key.split('&').next().unwrap_or(key); if !key.is_empty() { return Ok(key.to_string()); } } } } bail!("no download key found on LibGen ads page for md5 {md5}") } // -- Anna's Archive --------------------------------------------------------- /// Download a paper PDF via the Anna's Archive fast download JSON API. fn download_via_annas_archive( client: &reqwest::blocking::Client, md5: &str, key: &str, ) -> anyhow::Result> { eprintln!("Trying Anna's Archive fast download API…"); let api_url = format!( "https://annas-archive.li/dyn/api/fast_download.json?md5={md5}&key={key}" ); let resp: serde_json::Value = client .get(&api_url) .send() .context("failed to call Anna's Archive API")? .json() .context("failed to parse Anna's Archive API response")?; if let Some(err) = resp.get("error").and_then(|e| e.as_str()) { if !err.is_empty() { bail!("Anna's Archive API error: {err}"); } } let download_url = resp .get("download_url") .and_then(|u| u.as_str()) .context("no download_url in Anna's Archive API response")?; eprintln!("Downloading PDF from Anna's Archive…"); let bytes = client .get(download_url) .send() .context("failed to download from Anna's Archive")? .bytes()?; validate_pdf(&bytes)?; Ok(bytes.to_vec()) } // -- Unpaywall --------------------------------------------------------------- /// Try downloading an open-access PDF via the Unpaywall API. fn download_via_unpaywall( client: &reqwest::blocking::Client, doi: &str, ) -> anyhow::Result> { let email = std::env::var("UNPAYWALL_EMAIL") .context("UNPAYWALL_EMAIL not set")?; eprintln!("Checking Unpaywall for open-access PDF…"); let api_url = format!( "https://api.unpaywall.org/v2/{doi}?email={email}" ); let resp: serde_json::Value = client .get(&api_url) .send() .context("failed to call Unpaywall API")? .error_for_status() .context("Unpaywall API returned an error status")? .json() .context("failed to parse Unpaywall API response")?; let pdf_url = resp .get("best_oa_location") .and_then(|loc| loc.get("url_for_pdf")) .and_then(|u| u.as_str()) .context("no open-access PDF available via Unpaywall")?; eprintln!("Downloading open-access PDF from {pdf_url}"); let bytes = client .get(pdf_url) .send() .context("failed to download from Unpaywall PDF URL")? .error_for_status() .context("Unpaywall PDF URL returned an error status")? .bytes()?; validate_pdf(&bytes)?; Ok(bytes.to_vec()) } // -- Helpers ---------------------------------------------------------------- fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> { if bytes.len() < 1024 { bail!( "downloaded file is suspiciously small ({} bytes) — may not be a valid PDF", bytes.len() ); } Ok(()) } // --------------------------------------------------------------------------- // Conversion // --------------------------------------------------------------------------- /// Check if pdftotext output is usable: long enough and mostly readable text. fn pdftotext_output_is_good(text: &str) -> bool { if text.len() < 500 { return false; } let printable = text.chars().filter(|c| { c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation() }).count(); let ratio = printable as f64 / text.chars().count() as f64; ratio > 0.8 } /// Try extracting text from a PDF using pdftotext. /// Returns Some(text) if pdftotext succeeds and the output looks good, /// None otherwise. fn try_pdftotext(pdf_bytes: &[u8]) -> Option { let tmp_dir = tempfile::tempdir().ok()?; let pdf_path = tmp_dir.path().join("paper.pdf"); std::fs::write(&pdf_path, pdf_bytes).ok()?; eprintln!("Trying pdftotext…"); let output = Command::new("pdftotext") .arg("-layout") .arg(&pdf_path) .arg("-") .output(); match output { Err(e) if e.kind() == io::ErrorKind::NotFound => { eprintln!("pdftotext not found on PATH, skipping simple extraction"); None } Err(e) => { eprintln!("pdftotext failed: {e}"); None } Ok(o) if !o.status.success() => { eprintln!("pdftotext exited with {}", o.status); None } Ok(o) => { let text = String::from_utf8_lossy(&o.stdout).into_owned(); if pdftotext_output_is_good(&text) { eprintln!("pdftotext output looks good, skipping marker"); Some(text) } else { eprintln!("pdftotext output is low quality, falling back to marker"); None } } } } /// Write PDF bytes to a temp file, run marker_single, and return the /// resulting markdown. fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result { // Try simple extraction first. if let Some(text) = try_pdftotext(pdf_bytes) { return Ok(text); } // Fall back to marker-pdf. let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?; let pdf_path = tmp_dir.path().join("paper.pdf"); let out_dir = tmp_dir.path().join("output"); std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?; eprintln!("Converting PDF to markdown with marker…"); let status = Command::new("marker_single") .arg(&pdf_path) .arg("--output_dir") .arg(&out_dir) .arg("--output_format") .arg("markdown") .status(); match status { Err(e) if e.kind() == io::ErrorKind::NotFound => { bail!("marker_single not found on PATH. Install it with:\n pip install marker-pdf"); } Err(e) => bail!("failed to run marker_single: {e}"), Ok(s) if !s.success() => bail!("marker_single exited with {s}"), Ok(_) => {} } // marker_single creates a subdirectory inside our output dir — find // the .md file within it. find_markdown_file(&out_dir) } /// Recursively search a directory for the first .md file and read it. fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result { for entry in std::fs::read_dir(dir).context("failed to read marker output directory")? { let entry = entry?; let path = entry.path(); if path.is_dir() { if let Ok(md) = find_markdown_file(&path) { return Ok(md); } } else if path.extension().is_some_and(|ext| ext == "md") { let mut content = String::new(); std::fs::File::open(&path)?.read_to_string(&mut content)?; return Ok(content); } } bail!("no .md file found in marker output") } #[cfg(test)] mod tests { use super::*; #[test] fn good_text_passes_quality_check() { let text = "This is a normal academic paper abstract. ".repeat(20); assert!(pdftotext_output_is_good(&text)); } #[test] fn short_text_fails_quality_check() { assert!(!pdftotext_output_is_good("too short")); } #[test] fn garbled_text_fails_quality_check() { let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200); assert!(!pdftotext_output_is_good(&garbled)); } }