// paper-reader/src/main.rs

use std::io::{self, Read};
use std::path::PathBuf;
use std::process::Command;

use anyhow::{Context, bail};
use clap::Parser;
use scraper::{Html, Selector};

/// `User-Agent` header value installed on the HTTP client built by
/// `http_client`, so every request identifies itself as a desktop Firefox.
const USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0";
/// Scheme and host (no trailing slash) that all LibGen page and download URLs
/// in this file are built from.
const LIBGEN_BASE: &str = "https://libgen.li";

// Command-line arguments, parsed in `main` via `Args::parse()`.
//
// NOTE(review): the `///` comments on the fields are presumably picked up by
// clap's derive as the per-argument `--help` text, so they are user-facing
// strings rather than ordinary documentation — edit them deliberately.
#[derive(Parser)]
#[command(about = "Download a paper by DOI and convert it to markdown")]
struct Args {
    /// The DOI of the paper to download
    doi: String,

    /// Skip the cache and re-download/re-convert
    #[arg(long)]
    no_cache: bool,
}

fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    let doi = args.doi.trim_start_matches("https://doi.org/");

    if !args.no_cache {
        if let Some(cached) = read_cache(doi) {
            print!("{cached}");
            return Ok(());
        }
    }

    let pdf_bytes = download_pdf(doi)?;
    let markdown = convert_to_markdown(&pdf_bytes)?;

    write_cache(doi, &markdown);
    print!("{markdown}");
    Ok(())
}

// ---------------------------------------------------------------------------
// Cache
// ---------------------------------------------------------------------------

/// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`.
///
/// Following the XDG Base Directory rules, `XDG_CACHE_HOME` is honoured only
/// when it holds an absolute path; an unset, empty or relative value falls
/// back to `$HOME/.cache`. An unset or empty `HOME` yields `None`, which
/// callers treat as "caching unavailable".
fn cache_dir() -> Option<PathBuf> {
    let xdg_cache_home = std::env::var_os("XDG_CACHE_HOME")
        .map(PathBuf::from)
        // An empty path is not absolute, so this also rejects `XDG_CACHE_HOME=""`.
        .filter(|dir| dir.is_absolute());

    let base = xdg_cache_home.or_else(|| {
        std::env::var_os("HOME")
            // An empty HOME would silently turn into the relative path `.cache`.
            .filter(|home| !home.is_empty())
            .map(|home| PathBuf::from(home).join(".cache"))
    })?;

    Some(base.join("paper"))
}

/// Path to a cached markdown file for a given DOI.
/// DOI `10.1038/nature12373` maps to `<cache>/10.1038/nature12373.md`.
///
/// Returns `None` when no cache directory is available, or when the DOI would
/// not stay inside it: a leading `/` (which would make `join` discard the
/// cache directory entirely), a `..` segment, or a platform path prefix.
/// Callers treat `None` as "do not cache this DOI".
fn cache_path(doi: &str) -> Option<PathBuf> {
    // Append `.md` textually; `set_extension` would clobber dots that are
    // part of the DOI suffix itself.
    let relative = format!("{doi}.md");

    let mut path = cache_dir()?;
    for component in std::path::Path::new(&relative).components() {
        match component {
            std::path::Component::Normal(part) => path.push(part),
            // Root, prefix, `.` and `..` components could redirect the path
            // outside the cache directory.
            _ => return None,
        }
    }
    Some(path)
}

/// Load the cached markdown for `doi`, if there is any.
///
/// Yields `None` when no cache path can be derived or the file cannot be read
/// as UTF-8 text; a hit is announced on stderr before the content is returned.
fn read_cache(doi: &str) -> Option<String> {
    let location = cache_path(doi)?;
    // Any read failure (missing file, permissions, invalid UTF-8) is a miss.
    let cached = std::fs::read_to_string(&location).ok()?;
    eprintln!("Using cached result from {}", location.display());
    Some(cached)
}

/// Store `markdown` in the cache entry for `doi`.
///
/// Strictly best effort: when no cache path is available nothing happens, and
/// failures to create the parent directory or write the file are ignored.
fn write_cache(doi: &str, markdown: &str) {
    if let Some(target) = cache_path(doi) {
        if let Some(folder) = target.parent() {
            // Ignored on purpose; the write below fails on its own if needed.
            let _ = std::fs::create_dir_all(folder);
        }
        let _ = std::fs::write(&target, markdown);
    }
}

// ---------------------------------------------------------------------------
// Download
// ---------------------------------------------------------------------------

/// Build the blocking HTTP client shared by all download sources, with
/// `USER_AGENT` installed as its user agent.
///
/// # Errors
///
/// Propagates the builder's error if the client cannot be constructed.
fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
    let builder = reqwest::blocking::Client::builder().user_agent(USER_AGENT);
    let client = builder.build()?;
    Ok(client)
}

/// Fetch the PDF for `doi`, trying each source in turn.
///
/// Order of attempts:
/// 1. Unpaywall (open access; needs `UNPAYWALL_EMAIL`),
/// 2. LibGen,
/// 3. Anna's Archive fast-download API, only when `ANNAS_ARCHIVE_KEY` is set
///    and an MD5 for the paper can be resolved through LibGen.
///
/// Each failed attempt is reported on stderr; the first success wins.
///
/// # Errors
///
/// Fails when the HTTP client cannot be built or when no source delivers a
/// PDF.
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
    let client = http_client()?;

    // 1. Unpaywall.
    let unpaywall_error = match download_via_unpaywall(&client, doi) {
        Ok(pdf) => return Ok(pdf),
        Err(cause) => cause,
    };
    eprintln!("Unpaywall: {unpaywall_error:#}");

    // 2. LibGen.
    let libgen_error = match download_via_libgen(&client, doi) {
        Ok(pdf) => return Ok(pdf),
        Err(cause) => cause,
    };
    eprintln!("LibGen failed: {libgen_error:#}");

    // 3. Anna's Archive. The API is keyed by MD5, so look one up on LibGen
    //    again: the search may work even though the download above did not.
    let annas_credentials = std::env::var("ANNAS_ARCHIVE_KEY")
        .ok()
        .and_then(|key| resolve_md5_from_libgen(&client, doi).map(|md5| (md5, key)));
    if let Some((md5, key)) = annas_credentials {
        match download_via_annas_archive(&client, &md5, &key) {
            Ok(pdf) => return Ok(pdf),
            Err(cause) => eprintln!("Anna's Archive API failed: {cause:#}"),
        }
    }

    bail!("all download sources failed for DOI {doi}")
}

// -- LibGen -----------------------------------------------------------------

/// Look up the MD5 of the file behind `doi` on LibGen.
///
/// Runs the LibGen search and then reads the MD5 off the matching edition
/// page. Any failure along the way collapses to `None`.
fn resolve_md5_from_libgen(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> Option<String> {
    libgen_search(client, doi)
        .ok()
        .and_then(|edition| libgen_edition_md5(client, &edition).ok())
}

/// Download a paper PDF from LibGen by DOI.
///
/// Steps: search for the DOI, read the file's MD5 off the edition page, fetch
/// the download key from the ads page, then request `get.php` with both.
///
/// # Errors
///
/// Fails when any lookup step fails, when the download request errors or
/// answers with a non-success HTTP status, or when the body does not pass
/// `validate_pdf`.
fn download_via_libgen(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> anyhow::Result<Vec<u8>> {
    eprintln!("Searching LibGen for DOI {doi}");
    let edition_id = libgen_search(client, doi)?;

    eprintln!("Found edition {edition_id}, resolving download link…");
    let md5 = libgen_edition_md5(client, &edition_id)?;
    let download_key = libgen_download_key(client, &md5)?;

    let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}");
    eprintln!("Downloading PDF…");

    let bytes = client
        .get(&download_url)
        .send()
        .context("failed to request PDF from LibGen")?
        // Without this an HTML error page (4xx/5xx) would be handed on as if
        // it were the PDF; the Unpaywall path already checks the status.
        .error_for_status()
        .context("LibGen PDF download returned an error status")?
        .bytes()
        .context("failed to read PDF body")?;

    validate_pdf(&bytes)?;
    Ok(bytes.to_vec())
}

/// Search LibGen by DOI and return the first matching edition ID.
///
/// The DOI is sent as a properly encoded query parameter, so characters such
/// as `&`, `#` or `+` inside a DOI cannot corrupt the request. The edition ID
/// is taken from the first result link containing `edition.php?id=`, whether
/// the href is relative, root-relative or absolute, and is cut at the next
/// `&` or `#`.
///
/// # Errors
///
/// Fails when the request fails or returns a non-success status, or when the
/// result page contains no edition link.
fn libgen_search(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> anyhow::Result<String> {
    const MARKER: &str = "edition.php?id=";

    let url = reqwest::Url::parse_with_params(
        &format!("{LIBGEN_BASE}/index.php"),
        // `topics[]=a` and `res=25` are the same search options the URL
        // carried before; only the encoding of the DOI changes.
        &[("req", doi), ("topics[]", "a"), ("res", "25")],
    )
    .context("failed to build LibGen search URL")?;

    let html = client
        .get(url)
        .send()
        .context("failed to search LibGen")?
        .error_for_status()
        .context("LibGen search returned an error status")?
        .text()
        .context("failed to read LibGen search results")?;

    let doc = Html::parse_document(&html);
    let sel =
        Selector::parse("a[href*='edition.php?id=']").expect("valid selector");

    // Bound to a local so the iterator borrowing `doc` is gone before return.
    let edition_id = doc
        .select(&sel)
        .filter_map(|el| el.value().attr("href"))
        .find_map(|href| {
            let (_, rest) = href.split_once(MARKER)?;
            let id = rest.split(|c: char| c == '&' || c == '#').next()?;
            if id.is_empty() {
                None
            } else {
                Some(id.to_string())
            }
        });

    match edition_id {
        Some(id) => Ok(id),
        None => bail!("no results found on LibGen for DOI {doi}"),
    }
}

/// Fetch a LibGen edition page and extract the file's MD5.
///
/// The MD5 is read from the first link containing `ads.php?md5=` (relative,
/// root-relative or absolute href), cut at the next `&` or `#`. Values that
/// are empty or contain non-hexadecimal characters are skipped, because the
/// result is interpolated into further URLs.
///
/// # Errors
///
/// Fails when the page cannot be fetched or returns a non-success status, or
/// when it contains no usable download link.
fn libgen_edition_md5(
    client: &reqwest::blocking::Client,
    edition_id: &str,
) -> anyhow::Result<String> {
    const MARKER: &str = "ads.php?md5=";

    let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}");
    let html = client
        .get(&url)
        .send()
        .context("failed to fetch LibGen edition page")?
        .error_for_status()
        .context("LibGen edition page returned an error status")?
        .text()
        .context("failed to read LibGen edition page")?;

    let doc = Html::parse_document(&html);
    let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector");

    let md5 = doc
        .select(&sel)
        .filter_map(|el| el.value().attr("href"))
        .find_map(|href| {
            let (_, rest) = href.split_once(MARKER)?;
            // href may have extra params after the md5
            let candidate = rest
                .split(|c: char| c == '&' || c == '#')
                .next()
                .unwrap_or(rest);
            let is_hex = !candidate.is_empty()
                && candidate.chars().all(|c| c.is_ascii_hexdigit());
            if is_hex {
                Some(candidate.to_string())
            } else {
                None
            }
        });

    match md5 {
        Some(md5) => Ok(md5),
        None => bail!("no download link found on edition page {edition_id}"),
    }
}

/// Fetch the LibGen ads/download page for an MD5 and extract the one-time
/// download key.
///
/// The key is the value of the query parameter named exactly `key` on the
/// first `get.php?md5=` link that carries a non-empty one. Parameters whose
/// names merely end in `key` (for example `apikey=`) are not mistaken for it.
///
/// # Errors
///
/// Fails when the page cannot be fetched or returns a non-success status, or
/// when no link with a non-empty `key` parameter is present.
fn libgen_download_key(
    client: &reqwest::blocking::Client,
    md5: &str,
) -> anyhow::Result<String> {
    let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}");
    let html = client
        .get(&url)
        .send()
        .context("failed to fetch LibGen ads page")?
        .error_for_status()
        .context("LibGen ads page returned an error status")?
        .text()
        .context("failed to read LibGen ads page")?;

    let doc = Html::parse_document(&html);
    let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector");

    let download_key = doc
        .select(&sel)
        .filter_map(|el| el.value().attr("href"))
        .find_map(|href| {
            // Drop any fragment, then look only at the query string.
            let without_fragment = href.split('#').next().unwrap_or(href);
            let (_, query) = without_fragment.split_once('?')?;
            query
                .split('&')
                .filter_map(|pair| pair.strip_prefix("key="))
                .find(|value| !value.is_empty())
                .map(|value| value.to_string())
        });

    match download_key {
        Some(key) => Ok(key),
        None => bail!("no download key found on LibGen ads page for md5 {md5}"),
    }
}

// -- Anna's Archive ---------------------------------------------------------

/// Download a paper PDF via the Anna's Archive fast download JSON API.
///
/// The API is asked for a download URL for `md5`, authenticated with `key`;
/// both values are sent as encoded query parameters. A non-empty `error`
/// field in the JSON answer is reported as the failure reason. The returned
/// `download_url` is then fetched and checked with `validate_pdf`.
///
/// # Errors
///
/// Fails when the API call or its JSON parsing fails, when the API reports an
/// error or supplies no `download_url`, when the download request errors or
/// returns a non-success status, or when the body fails validation.
fn download_via_annas_archive(
    client: &reqwest::blocking::Client,
    md5: &str,
    key: &str,
) -> anyhow::Result<Vec<u8>> {
    eprintln!("Trying Anna's Archive fast download API…");

    // Encoding matters for `key`: it is an opaque user-supplied secret and
    // may contain characters that are special in a query string.
    let api_url = reqwest::Url::parse_with_params(
        "https://annas-archive.li/dyn/api/fast_download.json",
        &[("md5", md5), ("key", key)],
    )
    .context("failed to build Anna's Archive API URL")?;

    // No status check here on purpose: the JSON body's `error` field is
    // inspected below and gives the more useful message.
    let resp: serde_json::Value = client
        .get(api_url)
        .send()
        .context("failed to call Anna's Archive API")?
        .json()
        .context("failed to parse Anna's Archive API response")?;

    if let Some(err) = resp.get("error").and_then(|e| e.as_str()) {
        if !err.is_empty() {
            bail!("Anna's Archive API error: {err}");
        }
    }

    let download_url = resp
        .get("download_url")
        .and_then(|u| u.as_str())
        .context("no download_url in Anna's Archive API response")?;

    eprintln!("Downloading PDF from Anna's Archive…");
    let bytes = client
        .get(download_url)
        .send()
        .context("failed to download from Anna's Archive")?
        // Reject 4xx/5xx answers instead of treating their body as the PDF.
        .error_for_status()
        .context("Anna's Archive download returned an error status")?
        .bytes()
        .context("failed to read PDF body")?;

    validate_pdf(&bytes)?;
    Ok(bytes.to_vec())
}

// -- Unpaywall ---------------------------------------------------------------

/// Try downloading an open-access PDF via the Unpaywall API.
///
/// Requires the `UNPAYWALL_EMAIL` environment variable, which is passed to the
/// API as the `email` query parameter. The parameter is percent-encoded, so
/// addresses containing `+` or other reserved characters arrive intact. The
/// PDF is taken from `best_oa_location.url_for_pdf` in the JSON answer.
///
/// # Errors
///
/// Fails when `UNPAYWALL_EMAIL` is unset, when the API call fails, returns a
/// non-success status or unparsable JSON, when no open-access PDF URL is
/// listed, when the PDF request fails or returns a non-success status, or
/// when the body does not pass `validate_pdf`.
fn download_via_unpaywall(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> anyhow::Result<Vec<u8>> {
    let email = std::env::var("UNPAYWALL_EMAIL")
        .context("UNPAYWALL_EMAIL not set")?;

    eprintln!("Checking Unpaywall for open-access PDF…");

    // The DOI forms the tail of the URL path (its `/` separators must stay);
    // the email goes through the URL encoder.
    let api_url = reqwest::Url::parse_with_params(
        &format!("https://api.unpaywall.org/v2/{doi}"),
        &[("email", email.as_str())],
    )
    .context("failed to build Unpaywall API URL")?;

    let resp: serde_json::Value = client
        .get(api_url)
        .send()
        .context("failed to call Unpaywall API")?
        .error_for_status()
        .context("Unpaywall API returned an error status")?
        .json()
        .context("failed to parse Unpaywall API response")?;

    let pdf_url = resp
        .get("best_oa_location")
        .and_then(|loc| loc.get("url_for_pdf"))
        .and_then(|u| u.as_str())
        .context("no open-access PDF available via Unpaywall")?;

    eprintln!("Downloading open-access PDF from {pdf_url}");
    let bytes = client
        .get(pdf_url)
        .send()
        .context("failed to download from Unpaywall PDF URL")?
        .error_for_status()
        .context("Unpaywall PDF URL returned an error status")?
        .bytes()
        .context("failed to read PDF body")?;

    validate_pdf(&bytes)?;
    Ok(bytes.to_vec())
}

// -- Helpers ----------------------------------------------------------------

/// Sanity-check that a downloaded body is plausibly a PDF.
///
/// Two checks are made:
/// * the body is at least 1024 bytes long, and
/// * the `%PDF-` header marker occurs within the first 1024 bytes (PDF readers
///   commonly tolerate leading junk before the header, so the marker is not
///   required at offset 0).
///
/// The second check stops HTML error, login or captcha pages from being
/// passed on to the converters, and lets `download_pdf` move on to the next
/// source instead.
///
/// # Errors
///
/// Returns an error describing which check failed.
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
    const MIN_LEN: usize = 1024;
    const HEADER_WINDOW: usize = 1024;
    const MAGIC: &[u8] = b"%PDF-";

    if bytes.len() < MIN_LEN {
        bail!(
            "downloaded file is suspiciously small ({} bytes) — may not be a valid PDF",
            bytes.len()
        );
    }

    let head = &bytes[..HEADER_WINDOW.min(bytes.len())];
    let has_magic = head.windows(MAGIC.len()).any(|chunk| chunk == MAGIC);
    if !has_magic {
        bail!(
            "downloaded file does not look like a PDF (no %PDF- header in the first {HEADER_WINDOW} bytes)"
        );
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// Conversion
// ---------------------------------------------------------------------------

/// Decide whether text extracted by pdftotext is worth keeping.
///
/// The text is accepted when it is at least 500 bytes long and strictly more
/// than 80% of its characters are alphanumeric, whitespace or ASCII
/// punctuation.
fn pdftotext_output_is_good(text: &str) -> bool {
    const MIN_BYTES: usize = 500;
    const MIN_READABLE_RATIO: f64 = 0.8;

    // The length guard also keeps the division below away from zero.
    if text.len() < MIN_BYTES {
        return false;
    }

    let mut total = 0usize;
    let mut readable = 0usize;
    for ch in text.chars() {
        total += 1;
        if ch.is_alphanumeric() || ch.is_whitespace() || ch.is_ascii_punctuation() {
            readable += 1;
        }
    }

    readable as f64 / total as f64 > MIN_READABLE_RATIO
}

/// Attempt plain-text extraction with the external `pdftotext` tool.
///
/// The PDF is written to a temporary directory and `pdftotext -layout` is run
/// on it with output sent to stdout. `Some(text)` is returned only when the
/// tool ran successfully and `pdftotext_output_is_good` accepts its output;
/// every other outcome (temp dir or file failure, tool missing, non-zero
/// exit, poor output) gives `None`, with the reason logged to stderr where
/// one is known.
fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> {
    // `scratch` must stay alive until the command has finished: dropping it
    // removes the directory holding the input file.
    let scratch = tempfile::tempdir().ok()?;
    let input = scratch.path().join("paper.pdf");
    std::fs::write(&input, pdf_bytes).ok()?;

    eprintln!("Trying pdftotext…");

    let mut command = Command::new("pdftotext");
    command.arg("-layout").arg(&input).arg("-");

    let finished = match command.output() {
        Ok(finished) => finished,
        Err(e) => {
            if e.kind() == io::ErrorKind::NotFound {
                eprintln!("pdftotext not found on PATH, skipping simple extraction");
            } else {
                eprintln!("pdftotext failed: {e}");
            }
            return None;
        }
    };

    if !finished.status.success() {
        eprintln!("pdftotext exited with {}", finished.status);
        return None;
    }

    let extracted = String::from_utf8_lossy(&finished.stdout).into_owned();
    if !pdftotext_output_is_good(&extracted) {
        eprintln!("pdftotext output is low quality, falling back to marker");
        return None;
    }

    eprintln!("pdftotext output looks good, skipping marker");
    Some(extracted)
}

/// Turn PDF bytes into text/markdown.
///
/// `try_pdftotext` is attempted first and its output returned as-is when it
/// is usable. Otherwise the PDF is written to a temporary directory,
/// `marker_single` is run on it with markdown output, and the markdown file it
/// produced is read back.
///
/// # Errors
///
/// Fails when the temporary directory or file cannot be created, when
/// `marker_single` is missing, cannot be started or exits unsuccessfully, or
/// when no markdown file is found in its output.
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
    // Cheap path: plain text extraction.
    if let Some(plain) = try_pdftotext(pdf_bytes) {
        return Ok(plain);
    }

    // Expensive path: marker-pdf.
    let workspace = tempfile::tempdir().context("failed to create temp directory")?;
    let input = workspace.path().join("paper.pdf");
    let output_root = workspace.path().join("output");

    std::fs::write(&input, pdf_bytes).context("failed to write temp PDF")?;

    eprintln!("Converting PDF to markdown with marker…");

    let mut marker = Command::new("marker_single");
    marker
        .arg(&input)
        .arg("--output_dir")
        .arg(&output_root)
        .arg("--output_format")
        .arg("markdown");

    let exit = match marker.status() {
        Ok(exit) => exit,
        Err(e) if e.kind() == io::ErrorKind::NotFound => {
            bail!("marker_single not found on PATH. Install it with:\n  pip install marker-pdf")
        }
        Err(e) => bail!("failed to run marker_single: {e}"),
    };
    if !exit.success() {
        bail!("marker_single exited with {exit}");
    }

    // marker_single nests its result in a subdirectory of the output dir, so
    // search for the markdown file rather than guessing its name.
    find_markdown_file(&output_root)
}

/// Walk `dir` depth-first and return the contents of the first `.md` file
/// encountered.
///
/// Entries are visited in the order `read_dir` yields them. A subdirectory
/// that contains no markdown file (or cannot be searched) is skipped and the
/// walk continues with the next entry.
///
/// # Errors
///
/// Fails when `dir` cannot be listed, when an entry cannot be read, when the
/// matching file cannot be opened or read as UTF-8, or when no `.md` file
/// exists anywhere below `dir`.
fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
    let listing = std::fs::read_dir(dir).context("failed to read marker output directory")?;

    for item in listing {
        let candidate = item?.path();

        if candidate.is_dir() {
            // A failed search in one subtree is not fatal; try the siblings.
            match find_markdown_file(&candidate) {
                Ok(found) => return Ok(found),
                Err(_) => continue,
            }
        }

        let is_markdown = candidate.extension().is_some_and(|ext| ext == "md");
        if !is_markdown {
            continue;
        }

        let mut text = String::new();
        std::fs::File::open(&candidate)?.read_to_string(&mut text)?;
        return Ok(text);
    }

    bail!("no .md file found in marker output")
}

// Unit tests for the pdftotext quality heuristic, the only part of this file
// exercised here without network access or external tools.
#[cfg(test)]
mod tests {
    use super::*;

    // Ordinary English prose well above the 500-byte minimum is accepted.
    #[test]
    fn good_text_passes_quality_check() {
        let text = "This is a normal academic paper abstract. ".repeat(20);
        assert!(pdftotext_output_is_good(&text));
    }

    // Anything shorter than the minimum length is rejected outright.
    #[test]
    fn short_text_fails_quality_check() {
        assert!(!pdftotext_output_is_good("too short"));
    }

    // Long enough, but made only of replacement/block-drawing characters,
    // none of which count as readable, so the ratio check rejects it.
    #[test]
    fn garbled_text_fails_quality_check() {
        let garbled = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
        assert!(!pdftotext_output_is_good(&garbled));
    }
}