476 lines
15 KiB
Rust
476 lines
15 KiB
Rust
use std::io::{self, Read};
|
|
use std::path::PathBuf;
|
|
use std::process::Command;
|
|
|
|
use anyhow::{Context, bail};
|
|
use clap::Parser;
|
|
use scraper::{Html, Selector};
|
|
|
|
/// Browser-like `User-Agent` string that the shared HTTP client sends with its requests.
const USER_AGENT: &str = "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0";

/// Scheme and host that every LibGen URL in this file is built from.
const LIBGEN_BASE: &str = "https://libgen.li";
|
|
|
|
#[derive(Parser)]
|
|
#[command(about = "Download a paper by DOI and convert it to markdown")]
|
|
struct Args {
|
|
/// The DOI of the paper to download
|
|
doi: String,
|
|
|
|
/// Skip the cache and re-download/re-convert
|
|
#[arg(long)]
|
|
no_cache: bool,
|
|
}
|
|
|
|
fn main() -> anyhow::Result<()> {
|
|
let args = Args::parse();
|
|
let doi = args.doi.trim_start_matches("https://doi.org/");
|
|
|
|
if !args.no_cache {
|
|
if let Some(cached) = read_cache(doi) {
|
|
print!("{cached}");
|
|
return Ok(());
|
|
}
|
|
}
|
|
|
|
let pdf_bytes = download_pdf(doi)?;
|
|
let markdown = convert_to_markdown(&pdf_bytes)?;
|
|
|
|
write_cache(doi, &markdown);
|
|
print!("{markdown}");
|
|
Ok(())
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Cache
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`.
///
/// Per the XDG Base Directory Specification, an `XDG_CACHE_HOME` that is
/// empty or not an absolute path is ignored and the `$HOME/.cache` default
/// is used instead. Returns `None` when neither variable yields a usable
/// directory.
fn cache_dir() -> Option<PathBuf> {
    // An empty path is not absolute, so this one filter covers both the
    // "set but empty" and the "relative path" cases.
    let xdg = std::env::var_os("XDG_CACHE_HOME")
        .map(PathBuf::from)
        .filter(|p| p.is_absolute());

    let base = xdg.or_else(|| {
        std::env::var_os("HOME")
            .filter(|home| !home.is_empty())
            .map(|home| PathBuf::from(home).join(".cache"))
    })?;

    Some(base.join("paper"))
}
|
|
|
|
/// Path to a cached markdown file for a given DOI.
|
|
/// DOI `10.1038/nature12373` maps to `<cache>/10.1038/nature12373.md`.
|
|
fn cache_path(doi: &str) -> Option<PathBuf> {
|
|
cache_dir().map(|d| d.join(format!("{doi}.md")))
|
|
}
|
|
|
|
fn read_cache(doi: &str) -> Option<String> {
|
|
let path = cache_path(doi)?;
|
|
match std::fs::read_to_string(&path) {
|
|
Ok(content) => {
|
|
eprintln!("Using cached result from {}", path.display());
|
|
Some(content)
|
|
}
|
|
Err(_) => None,
|
|
}
|
|
}
|
|
|
|
fn write_cache(doi: &str, markdown: &str) {
|
|
let Some(path) = cache_path(doi) else { return };
|
|
if let Some(parent) = path.parent() {
|
|
let _ = std::fs::create_dir_all(parent);
|
|
}
|
|
let _ = std::fs::write(&path, markdown);
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Download
|
|
// ---------------------------------------------------------------------------
|
|
|
|
fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
|
|
Ok(reqwest::blocking::Client::builder()
|
|
.user_agent(USER_AGENT)
|
|
.build()?)
|
|
}
|
|
|
|
/// Download a paper PDF by DOI.
|
|
///
|
|
/// Tries Unpaywall first (free open-access, requires `UNPAYWALL_EMAIL`).
|
|
/// Falls back to LibGen (free, no JS challenge), then Anna's Archive if an
|
|
/// API key is configured via `ANNAS_ARCHIVE_KEY`.
|
|
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
|
let client = http_client()?;
|
|
|
|
// Try Unpaywall first (free open-access).
|
|
match download_via_unpaywall(&client, doi) {
|
|
Ok(bytes) => return Ok(bytes),
|
|
Err(e) => eprintln!("Unpaywall: {e:#}"),
|
|
}
|
|
|
|
// Try LibGen.
|
|
match download_via_libgen(&client, doi) {
|
|
Ok(bytes) => return Ok(bytes),
|
|
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
|
}
|
|
|
|
// Try Anna's Archive fast download API if a key is available.
|
|
// This requires an MD5 — attempt to resolve one from LibGen even if the
|
|
// download itself failed (the search may have worked).
|
|
if let Ok(key) = std::env::var("ANNAS_ARCHIVE_KEY") {
|
|
if let Some(md5) = resolve_md5_from_libgen(&client, doi) {
|
|
match download_via_annas_archive(&client, &md5, &key) {
|
|
Ok(bytes) => return Ok(bytes),
|
|
Err(e) => eprintln!("Anna's Archive API failed: {e:#}"),
|
|
}
|
|
}
|
|
}
|
|
|
|
bail!("all download sources failed for DOI {doi}")
|
|
}
|
|
|
|
// -- LibGen -----------------------------------------------------------------
|
|
|
|
/// Resolve a DOI to a paper MD5 via LibGen search + edition page.
|
|
fn resolve_md5_from_libgen(
|
|
client: &reqwest::blocking::Client,
|
|
doi: &str,
|
|
) -> Option<String> {
|
|
let edition_id = libgen_search(client, doi).ok()?;
|
|
libgen_edition_md5(client, &edition_id).ok()
|
|
}
|
|
|
|
/// Download a paper PDF from LibGen by DOI.
|
|
fn download_via_libgen(
|
|
client: &reqwest::blocking::Client,
|
|
doi: &str,
|
|
) -> anyhow::Result<Vec<u8>> {
|
|
eprintln!("Searching LibGen for DOI {doi}");
|
|
let edition_id = libgen_search(client, doi)?;
|
|
|
|
eprintln!("Found edition {edition_id}, resolving download link…");
|
|
let md5 = libgen_edition_md5(client, &edition_id)?;
|
|
let download_key = libgen_download_key(client, &md5)?;
|
|
|
|
let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}");
|
|
eprintln!("Downloading PDF…");
|
|
|
|
let bytes = client
|
|
.get(&download_url)
|
|
.send()
|
|
.context("failed to request PDF from LibGen")?
|
|
.bytes()
|
|
.context("failed to read PDF body")?;
|
|
|
|
validate_pdf(&bytes)?;
|
|
Ok(bytes.to_vec())
|
|
}
|
|
|
|
/// Search LibGen by DOI and return the first matching edition ID.
|
|
fn libgen_search(
|
|
client: &reqwest::blocking::Client,
|
|
doi: &str,
|
|
) -> anyhow::Result<String> {
|
|
let url = format!("{LIBGEN_BASE}/index.php?req={doi}&topics%5B%5D=a&res=25");
|
|
let html = client
|
|
.get(&url)
|
|
.send()
|
|
.context("failed to search LibGen")?
|
|
.text()?;
|
|
|
|
let doc = Html::parse_document(&html);
|
|
let sel =
|
|
Selector::parse("a[href*='edition.php?id=']").expect("valid selector");
|
|
|
|
for el in doc.select(&sel) {
|
|
if let Some(href) = el.value().attr("href") {
|
|
if let Some(id) = href.strip_prefix("edition.php?id=") {
|
|
return Ok(id.to_string());
|
|
}
|
|
}
|
|
}
|
|
bail!("no results found on LibGen for DOI {doi}")
|
|
}
|
|
|
|
/// Fetch a LibGen edition page and extract the file's MD5.
|
|
fn libgen_edition_md5(
|
|
client: &reqwest::blocking::Client,
|
|
edition_id: &str,
|
|
) -> anyhow::Result<String> {
|
|
let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}");
|
|
let html = client.get(&url).send()?.text()?;
|
|
|
|
let doc = Html::parse_document(&html);
|
|
let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector");
|
|
|
|
for el in doc.select(&sel) {
|
|
if let Some(href) = el.value().attr("href") {
|
|
if let Some(rest) = href.strip_prefix("ads.php?md5=") {
|
|
// href may have extra params after the md5
|
|
let md5 = rest.split('&').next().unwrap_or(rest);
|
|
return Ok(md5.to_string());
|
|
}
|
|
if let Some(rest) = href.strip_prefix("/ads.php?md5=") {
|
|
let md5 = rest.split('&').next().unwrap_or(rest);
|
|
return Ok(md5.to_string());
|
|
}
|
|
}
|
|
}
|
|
bail!("no download link found on edition page {edition_id}")
|
|
}
|
|
|
|
/// Fetch the LibGen ads/download page for an MD5 and extract the one-time
|
|
/// download key.
|
|
fn libgen_download_key(
|
|
client: &reqwest::blocking::Client,
|
|
md5: &str,
|
|
) -> anyhow::Result<String> {
|
|
let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}");
|
|
let html = client.get(&url).send()?.text()?;
|
|
|
|
let doc = Html::parse_document(&html);
|
|
let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector");
|
|
|
|
for el in doc.select(&sel) {
|
|
if let Some(href) = el.value().attr("href") {
|
|
// Extract key= param from the get.php link
|
|
if let Some(idx) = href.find("key=") {
|
|
let key = &href[idx + 4..];
|
|
let key = key.split('&').next().unwrap_or(key);
|
|
if !key.is_empty() {
|
|
return Ok(key.to_string());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
bail!("no download key found on LibGen ads page for md5 {md5}")
|
|
}
|
|
|
|
// -- Anna's Archive ---------------------------------------------------------
|
|
|
|
/// Download a paper PDF via the Anna's Archive fast download JSON API.
|
|
fn download_via_annas_archive(
|
|
client: &reqwest::blocking::Client,
|
|
md5: &str,
|
|
key: &str,
|
|
) -> anyhow::Result<Vec<u8>> {
|
|
eprintln!("Trying Anna's Archive fast download API…");
|
|
|
|
let api_url = format!(
|
|
"https://annas-archive.li/dyn/api/fast_download.json?md5={md5}&key={key}"
|
|
);
|
|
|
|
let resp: serde_json::Value = client
|
|
.get(&api_url)
|
|
.send()
|
|
.context("failed to call Anna's Archive API")?
|
|
.json()
|
|
.context("failed to parse Anna's Archive API response")?;
|
|
|
|
if let Some(err) = resp.get("error").and_then(|e| e.as_str()) {
|
|
if !err.is_empty() {
|
|
bail!("Anna's Archive API error: {err}");
|
|
}
|
|
}
|
|
|
|
let download_url = resp
|
|
.get("download_url")
|
|
.and_then(|u| u.as_str())
|
|
.context("no download_url in Anna's Archive API response")?;
|
|
|
|
eprintln!("Downloading PDF from Anna's Archive…");
|
|
let bytes = client
|
|
.get(download_url)
|
|
.send()
|
|
.context("failed to download from Anna's Archive")?
|
|
.bytes()?;
|
|
|
|
validate_pdf(&bytes)?;
|
|
Ok(bytes.to_vec())
|
|
}
|
|
|
|
// -- Unpaywall ---------------------------------------------------------------
|
|
|
|
/// Try downloading an open-access PDF via the Unpaywall API.
|
|
fn download_via_unpaywall(
|
|
client: &reqwest::blocking::Client,
|
|
doi: &str,
|
|
) -> anyhow::Result<Vec<u8>> {
|
|
let email = std::env::var("UNPAYWALL_EMAIL")
|
|
.context("UNPAYWALL_EMAIL not set")?;
|
|
|
|
eprintln!("Checking Unpaywall for open-access PDF…");
|
|
|
|
let api_url = format!(
|
|
"https://api.unpaywall.org/v2/{doi}?email={email}"
|
|
);
|
|
|
|
let resp: serde_json::Value = client
|
|
.get(&api_url)
|
|
.send()
|
|
.context("failed to call Unpaywall API")?
|
|
.error_for_status()
|
|
.context("Unpaywall API returned an error status")?
|
|
.json()
|
|
.context("failed to parse Unpaywall API response")?;
|
|
|
|
let pdf_url = resp
|
|
.get("best_oa_location")
|
|
.and_then(|loc| loc.get("url_for_pdf"))
|
|
.and_then(|u| u.as_str())
|
|
.context("no open-access PDF available via Unpaywall")?;
|
|
|
|
eprintln!("Downloading open-access PDF from {pdf_url}");
|
|
let bytes = client
|
|
.get(pdf_url)
|
|
.send()
|
|
.context("failed to download from Unpaywall PDF URL")?
|
|
.error_for_status()
|
|
.context("Unpaywall PDF URL returned an error status")?
|
|
.bytes()?;
|
|
|
|
validate_pdf(&bytes)?;
|
|
Ok(bytes.to_vec())
|
|
}
|
|
|
|
// -- Helpers ----------------------------------------------------------------
|
|
|
|
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
|
if bytes.len() < 1024 {
|
|
bail!(
|
|
"downloaded file is suspiciously small ({} bytes) — may not be a valid PDF",
|
|
bytes.len()
|
|
);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Conversion
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Decide whether text extracted by pdftotext is worth keeping.
///
/// The text is accepted when it is at least 500 bytes long and more than
/// 80% of its characters are alphanumeric, whitespace or ASCII punctuation.
fn pdftotext_output_is_good(text: &str) -> bool {
    const MIN_BYTES: usize = 500;
    const MIN_READABLE_RATIO: f64 = 0.8;

    if text.len() < MIN_BYTES {
        return false;
    }

    // Count readable and total characters in a single pass.
    let (readable, total) = text.chars().fold((0usize, 0usize), |(readable, total), ch| {
        let is_readable =
            ch.is_alphanumeric() || ch.is_whitespace() || ch.is_ascii_punctuation();
        (readable + usize::from(is_readable), total + 1)
    });

    readable as f64 / total as f64 > MIN_READABLE_RATIO
}
|
|
|
|
/// Try extracting text from a PDF using pdftotext.
|
|
/// Returns Some(text) if pdftotext succeeds and the output looks good,
|
|
/// None otherwise.
|
|
fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> {
|
|
let tmp_dir = tempfile::tempdir().ok()?;
|
|
let pdf_path = tmp_dir.path().join("paper.pdf");
|
|
std::fs::write(&pdf_path, pdf_bytes).ok()?;
|
|
|
|
eprintln!("Trying pdftotext…");
|
|
|
|
let output = Command::new("pdftotext")
|
|
.arg("-layout")
|
|
.arg(&pdf_path)
|
|
.arg("-")
|
|
.output();
|
|
|
|
match output {
|
|
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
|
eprintln!("pdftotext not found on PATH, skipping simple extraction");
|
|
None
|
|
}
|
|
Err(e) => {
|
|
eprintln!("pdftotext failed: {e}");
|
|
None
|
|
}
|
|
Ok(o) if !o.status.success() => {
|
|
eprintln!("pdftotext exited with {}", o.status);
|
|
None
|
|
}
|
|
Ok(o) => {
|
|
let text = String::from_utf8_lossy(&o.stdout).into_owned();
|
|
if pdftotext_output_is_good(&text) {
|
|
eprintln!("pdftotext output looks good, skipping marker");
|
|
Some(text)
|
|
} else {
|
|
eprintln!("pdftotext output is low quality, falling back to marker");
|
|
None
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Write PDF bytes to a temp file, run marker_single, and return the
|
|
/// resulting markdown.
|
|
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
|
// Try simple extraction first.
|
|
if let Some(text) = try_pdftotext(pdf_bytes) {
|
|
return Ok(text);
|
|
}
|
|
|
|
// Fall back to marker-pdf.
|
|
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
|
let pdf_path = tmp_dir.path().join("paper.pdf");
|
|
let out_dir = tmp_dir.path().join("output");
|
|
|
|
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
|
|
|
eprintln!("Converting PDF to markdown with marker…");
|
|
|
|
let status = Command::new("marker_single")
|
|
.arg(&pdf_path)
|
|
.arg("--output_dir")
|
|
.arg(&out_dir)
|
|
.arg("--output_format")
|
|
.arg("markdown")
|
|
.status();
|
|
|
|
match status {
|
|
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
|
bail!("marker_single not found on PATH. Install it with:\n pip install marker-pdf");
|
|
}
|
|
Err(e) => bail!("failed to run marker_single: {e}"),
|
|
Ok(s) if !s.success() => bail!("marker_single exited with {s}"),
|
|
Ok(_) => {}
|
|
}
|
|
|
|
// marker_single creates a subdirectory inside our output dir — find
|
|
// the .md file within it.
|
|
find_markdown_file(&out_dir)
|
|
}
|
|
|
|
/// Recursively search a directory for the first .md file and read it.
|
|
fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
|
|
for entry in std::fs::read_dir(dir).context("failed to read marker output directory")? {
|
|
let entry = entry?;
|
|
let path = entry.path();
|
|
if path.is_dir() {
|
|
if let Ok(md) = find_markdown_file(&path) {
|
|
return Ok(md);
|
|
}
|
|
} else if path.extension().is_some_and(|ext| ext == "md") {
|
|
let mut content = String::new();
|
|
std::fs::File::open(&path)?.read_to_string(&mut content)?;
|
|
return Ok(content);
|
|
}
|
|
}
|
|
bail!("no .md file found in marker output")
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // Ordinary English prose, well over the minimum length, is accepted.
    #[test]
    fn good_text_passes_quality_check() {
        let sample = "This is a normal academic paper abstract. ".repeat(20);
        let accepted = pdftotext_output_is_good(&sample);
        assert!(accepted);
    }

    // Anything under the minimum length is rejected regardless of content.
    #[test]
    fn short_text_fails_quality_check() {
        let accepted = pdftotext_output_is_good("too short");
        assert!(!accepted);
    }

    // Long output made only of replacement and block characters is rejected.
    #[test]
    fn garbled_text_fails_quality_check() {
        let sample = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
        let accepted = pdftotext_output_is_good(&sample);
        assert!(!accepted);
    }
}
|