paper-reader/src/main.rs
Ellie ff29d6109d feat: try pdftotext before marker-pdf for conversion
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 13:25:50 -08:00

426 lines
13 KiB
Rust

use std::io::{self, Read};
use std::path::PathBuf;
use std::process::Command;
use anyhow::{Context, bail};
use clap::Parser;
use scraper::{Html, Selector};
/// User-Agent string handed to the HTTP client builder in `http_client`.
const USER_AGENT: &str =
"Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0";
/// Scheme and host that every LibGen URL built in this file starts with
/// (search, edition, ads and download pages).
const LIBGEN_BASE: &str = "https://libgen.li";
// Command-line arguments, parsed in `main` via `Args::parse()`.
// NOTE(review): with clap's derive API the `///` comments on the fields below
// are presumably used as the `--help` text, so treat them as user-facing
// strings rather than internal documentation — confirm before rewording.
#[derive(Parser)]
#[command(about = "Download a paper by DOI and convert it to markdown")]
struct Args {
// `main` strips a leading `https://doi.org/` from this value before using it.
/// The DOI of the paper to download
doi: String,
// When set, `main` does not consult the cache before downloading; the fresh
// result is still written to the cache afterwards.
/// Skip the cache and re-download/re-convert
#[arg(long)]
no_cache: bool,
}
/// Entry point: resolve the DOI given on the command line to markdown and
/// print it on stdout.
///
/// A cached conversion is printed directly unless `--no-cache` was given.
/// Otherwise the PDF is downloaded, converted, stored in the cache
/// (best-effort) and printed.
fn main() -> anyhow::Result<()> {
    let cli = Args::parse();
    // Accept a full resolver URL as well as a bare DOI.
    let doi = cli.doi.trim_start_matches("https://doi.org/");

    let cache_hit = if cli.no_cache { None } else { read_cache(doi) };
    if let Some(text) = cache_hit {
        print!("{text}");
        return Ok(());
    }

    let pdf = download_pdf(doi)?;
    let rendered = convert_to_markdown(&pdf)?;
    write_cache(doi, &rendered);
    print!("{rendered}");
    Ok(())
}
// ---------------------------------------------------------------------------
// Cache
// ---------------------------------------------------------------------------
/// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`.
///
/// Following the XDG Base Directory rules, `XDG_CACHE_HOME` is honoured only
/// when it holds an absolute path; an empty or relative value is ignored and
/// `$HOME/.cache` is used instead. Returns `None` when neither variable
/// yields a usable base directory.
fn cache_dir() -> Option<PathBuf> {
    let xdg = std::env::var_os("XDG_CACHE_HOME")
        .map(PathBuf::from)
        // An empty value would otherwise become the relative path `paper`,
        // i.e. a cache inside whatever the current directory happens to be.
        .filter(|dir| dir.is_absolute());

    let base = xdg.or_else(|| {
        std::env::var_os("HOME")
            .filter(|home| !home.is_empty())
            .map(|home| PathBuf::from(home).join(".cache"))
    })?;

    Some(base.join("paper"))
}
/// Path to a cached markdown file for a given DOI.
/// DOI `10.1038/nature12373` maps to `<cache>/10.1038/nature12373.md`.
///
/// Returns `None` when no cache directory is available, or when the DOI
/// cannot be mapped to a location inside it: an empty DOI, or one containing
/// a `.`/`..` segment or anything else that is not a plain file-name
/// component. Callers treat `None` as "no caching for this DOI".
fn cache_path(doi: &str) -> Option<PathBuf> {
    let mut path = cache_dir()?;
    let mut pushed_any = false;

    // Build the path one segment at a time instead of joining the raw DOI:
    // `PathBuf::join` with an absolute string replaces the base entirely and
    // `..` segments would climb out of the cache directory.
    for segment in doi.split('/').filter(|s| !s.is_empty()) {
        let mut parts = std::path::Path::new(segment).components();
        let is_plain = matches!(
            (parts.next(), parts.next()),
            (Some(std::path::Component::Normal(_)), None)
        );
        if !is_plain {
            return None;
        }
        path.push(segment);
        pushed_any = true;
    }
    if !pushed_any {
        return None;
    }

    // Append the suffix textually: DOIs contain dots, so extension-based
    // helpers would clobber part of the identifier.
    let mut file_name = path.into_os_string();
    file_name.push(".md");
    Some(PathBuf::from(file_name))
}
/// Load the cached markdown for `doi`, if there is any.
///
/// Every failure (no cache path, missing or unreadable file) is treated as a
/// cache miss. On a hit, a note naming the file is printed to stderr.
fn read_cache(doi: &str) -> Option<String> {
    let cached_file = cache_path(doi)?;
    let markdown = std::fs::read_to_string(&cached_file).ok()?;
    eprintln!("Using cached result from {}", cached_file.display());
    Some(markdown)
}
/// Store `markdown` as the cache entry for `doi`, creating parent
/// directories as needed.
///
/// Caching is best-effort: a failure never aborts the run, but it is reported
/// on stderr so that a cache that silently never fills can be diagnosed.
/// Does nothing when no cache path can be derived for the DOI.
fn write_cache(doi: &str, markdown: &str) {
    let Some(path) = cache_path(doi) else { return };

    if let Some(parent) = path.parent() {
        if let Err(e) = std::fs::create_dir_all(parent) {
            eprintln!(
                "Warning: could not create cache directory {}: {e}",
                parent.display()
            );
            return;
        }
    }

    if let Err(e) = std::fs::write(&path, markdown) {
        eprintln!("Warning: could not write cache file {}: {e}", path.display());
    }
}
// ---------------------------------------------------------------------------
// Download
// ---------------------------------------------------------------------------
/// Build the blocking HTTP client shared by all download steps, configured
/// with the `USER_AGENT` string.
fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
    let builder = reqwest::blocking::Client::builder().user_agent(USER_AGENT);
    let client = builder.build()?;
    Ok(client)
}
/// Fetch the PDF for `doi`, trying each source in turn.
///
/// LibGen is attempted first. If it fails and the `ANNAS_ARCHIVE_KEY`
/// environment variable is set, the Anna's Archive fast download API is used
/// as a fallback. Failures of individual sources are logged to stderr; an
/// error is returned only when every source has failed.
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
    let client = http_client()?;

    // Primary source.
    let libgen_error = match download_via_libgen(&client, doi) {
        Ok(pdf) => return Ok(pdf),
        Err(err) => err,
    };
    eprintln!("LibGen failed: {libgen_error:#}");

    // Fallback source. It is addressed by MD5, so the LibGen lookup is
    // repeated: the search may have succeeded even though the download did
    // not. The lookup only happens when a key is configured.
    let fallback = std::env::var("ANNAS_ARCHIVE_KEY")
        .ok()
        .and_then(|key| resolve_md5_from_libgen(&client, doi).map(|md5| (md5, key)));
    if let Some((md5, key)) = fallback {
        match download_via_annas_archive(&client, &md5, &key) {
            Ok(pdf) => return Ok(pdf),
            Err(err) => eprintln!("Anna's Archive API failed: {err:#}"),
        }
    }

    bail!("all download sources failed for DOI {doi}")
}
// -- LibGen -----------------------------------------------------------------
/// Look up the MD5 of the file LibGen holds for `doi`.
///
/// Chains the LibGen search (DOI → edition ID) with the edition page lookup
/// (edition ID → MD5). Any failure along the way yields `None`.
fn resolve_md5_from_libgen(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> Option<String> {
    libgen_search(client, doi)
        .and_then(|edition| libgen_edition_md5(client, &edition))
        .ok()
}
/// Download a paper PDF from LibGen by DOI.
///
/// Steps: search by DOI for an edition ID, read the file's MD5 from the
/// edition page, read the download key from the ads page, then fetch
/// `get.php?md5=…&key=…`. Progress is logged to stderr.
///
/// # Errors
///
/// Fails if any lookup step fails, if the download request cannot be sent or
/// is answered with an HTTP error status, if the body cannot be read, or if
/// `validate_pdf` rejects the downloaded bytes.
fn download_via_libgen(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> anyhow::Result<Vec<u8>> {
    eprintln!("Searching LibGen for DOI {doi}");
    let edition_id = libgen_search(client, doi)?;
    eprintln!("Found edition {edition_id}, resolving download link…");
    let md5 = libgen_edition_md5(client, &edition_id)?;
    let download_key = libgen_download_key(client, &md5)?;
    let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}");
    eprintln!("Downloading PDF…");
    let bytes = client
        .get(&download_url)
        .send()
        .context("failed to request PDF from LibGen")?
        // Without this, a 4xx/5xx error page would be returned as if it were
        // the paper and the fallback source would never be tried.
        .error_for_status()
        .context("LibGen answered the PDF request with an error status")?
        .bytes()
        .context("failed to read PDF body")?;
    validate_pdf(&bytes)?;
    Ok(bytes.to_vec())
}
/// Search LibGen by DOI and return the first matching edition ID.
///
/// The DOI is sent as a properly encoded query parameter, so DOIs containing
/// characters such as `&`, `#`, `+`, `<` or `>` are searched verbatim. The
/// edition ID is taken from the first link whose `href` contains
/// `edition.php?id=`, whether the link is relative, root-relative or
/// absolute.
///
/// # Errors
///
/// Fails if the request cannot be sent, the server answers with an HTTP
/// error status, the body cannot be read, or no edition link is present.
fn libgen_search(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> anyhow::Result<String> {
    const EDITION_MARKER: &str = "edition.php?id=";

    let endpoint = format!("{LIBGEN_BASE}/index.php");
    // `topics[]=a` and `res=25` are the same parameters the hand-built URL
    // used to carry (as `topics%5B%5D=a&res=25`).
    let url = reqwest::Url::parse_with_params(
        &endpoint,
        &[("req", doi), ("topics[]", "a"), ("res", "25")],
    )
    .context("failed to build LibGen search URL")?;

    let html = client
        .get(url)
        .send()
        .context("failed to search LibGen")?
        .error_for_status()
        .context("LibGen search returned an error status")?
        .text()
        .context("failed to read LibGen search results")?;

    let doc = Html::parse_document(&html);
    let sel =
        Selector::parse("a[href*='edition.php?id=']").expect("valid selector");
    for el in doc.select(&sel) {
        let Some(href) = el.value().attr("href") else { continue };
        let Some((_, rest)) = href.split_once(EDITION_MARKER) else { continue };
        // Keep only the id itself; drop any further parameters or fragment.
        let id = rest.split(&['&', '#'][..]).next().unwrap_or(rest);
        if !id.is_empty() {
            return Ok(id.to_string());
        }
    }
    bail!("no results found on LibGen for DOI {doi}")
}
/// Fetch a LibGen edition page and extract the file's MD5.
///
/// The MD5 is read from the first link whose `href` contains `ads.php?md5=`
/// (relative, root-relative or absolute); anything after the MD5 value —
/// further parameters or a fragment — is dropped.
///
/// # Errors
///
/// Fails if the request cannot be sent, the server answers with an HTTP
/// error status, the body cannot be read, or no such link is present.
fn libgen_edition_md5(
    client: &reqwest::blocking::Client,
    edition_id: &str,
) -> anyhow::Result<String> {
    const ADS_MARKER: &str = "ads.php?md5=";

    let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}");
    let html = client
        .get(&url)
        .send()
        .context("failed to fetch LibGen edition page")?
        .error_for_status()
        .context("LibGen edition page returned an error status")?
        .text()
        .context("failed to read LibGen edition page")?;

    let doc = Html::parse_document(&html);
    let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector");
    for el in doc.select(&sel) {
        let Some(href) = el.value().attr("href") else { continue };
        let Some((_, rest)) = href.split_once(ADS_MARKER) else { continue };
        // href may have extra params after the md5
        let md5 = rest.split(&['&', '#'][..]).next().unwrap_or(rest);
        if !md5.is_empty() {
            return Ok(md5.to_string());
        }
    }
    bail!("no download link found on edition page {edition_id}")
}
/// Fetch the LibGen ads/download page for an MD5 and extract the download
/// key from its `get.php` link.
///
/// Only a query parameter named exactly `key` is accepted, so a parameter
/// whose name merely ends in `key` is not mistaken for it. Links with an
/// empty key are skipped.
///
/// # Errors
///
/// Fails if the request cannot be sent, the server answers with an HTTP
/// error status, the body cannot be read, or no link carries a key.
fn libgen_download_key(
    client: &reqwest::blocking::Client,
    md5: &str,
) -> anyhow::Result<String> {
    let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}");
    let html = client
        .get(&url)
        .send()
        .context("failed to fetch LibGen ads page")?
        .error_for_status()
        .context("LibGen ads page returned an error status")?
        .text()
        .context("failed to read LibGen ads page")?;

    let doc = Html::parse_document(&html);
    let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector");
    for el in doc.select(&sel) {
        let Some(href) = el.value().attr("href") else { continue };
        // Ignore any fragment, then look at the individual parameters.
        let without_fragment = href.split('#').next().unwrap_or(href);
        let key = without_fragment
            .split(&['?', '&'][..])
            .filter_map(|param| param.strip_prefix("key="))
            .find(|value| !value.is_empty());
        if let Some(key) = key {
            return Ok(key.to_string());
        }
    }
    bail!("no download key found on LibGen ads page for md5 {md5}")
}
// -- Anna's Archive ---------------------------------------------------------
/// Download a paper PDF via the Anna's Archive fast download JSON API.
///
/// Calls `fast_download.json` with the file's `md5` and the account `key`,
/// reads `download_url` from the JSON reply and fetches that URL.
///
/// # Errors
///
/// Fails if the API call or its JSON decoding fails, if the reply carries a
/// non-empty `error` string or lacks `download_url`, if the file request
/// cannot be sent or is answered with an HTTP error status, if the body
/// cannot be read, or if `validate_pdf` rejects the downloaded bytes.
fn download_via_annas_archive(
    client: &reqwest::blocking::Client,
    md5: &str,
    key: &str,
) -> anyhow::Result<Vec<u8>> {
    eprintln!("Trying Anna's Archive fast download API…");
    let api_url = format!(
        "https://annas-archive.li/dyn/api/fast_download.json?md5={md5}&key={key}"
    );
    // The status of the API call itself is deliberately not checked: the
    // JSON body is parsed either way so that an `error` message in it can be
    // reported below.
    let resp: serde_json::Value = client
        .get(&api_url)
        .send()
        .context("failed to call Anna's Archive API")?
        .json()
        .context("failed to parse Anna's Archive API response")?;
    if let Some(err) = resp.get("error").and_then(|e| e.as_str()) {
        if !err.is_empty() {
            bail!("Anna's Archive API error: {err}");
        }
    }
    let download_url = resp
        .get("download_url")
        .and_then(|u| u.as_str())
        .context("no download_url in Anna's Archive API response")?;
    eprintln!("Downloading PDF from Anna's Archive…");
    let bytes = client
        .get(download_url)
        .send()
        .context("failed to download from Anna's Archive")?
        // An error page must not be mistaken for the paper.
        .error_for_status()
        .context("Anna's Archive download returned an error status")?
        .bytes()
        .context("failed to read PDF body from Anna's Archive")?;
    validate_pdf(&bytes)?;
    Ok(bytes.to_vec())
}
// -- Helpers ----------------------------------------------------------------
/// Sanity-check downloaded bytes before they are handed to a converter.
///
/// Rejects payloads that are too small to be a paper and payloads that do
/// not carry the `%PDF-` header marker near the start. The second check
/// catches HTML error or challenge pages that are large enough to pass the
/// size check.
///
/// # Errors
///
/// Returns an error describing which check failed.
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
    const MIN_PLAUSIBLE_LEN: usize = 1024;
    const PDF_MAGIC: &[u8] = b"%PDF-";
    // The marker normally sits at offset 0, but common readers tolerate
    // leading junk, so look through the first KiB rather than only byte 0.
    const HEADER_SEARCH_WINDOW: usize = 1024;

    if bytes.len() < MIN_PLAUSIBLE_LEN {
        bail!(
            "downloaded file is suspiciously small ({} bytes) — may not be a valid PDF",
            bytes.len()
        );
    }

    let head = &bytes[..HEADER_SEARCH_WINDOW.min(bytes.len())];
    let has_magic = head.windows(PDF_MAGIC.len()).any(|w| w == PDF_MAGIC);
    if !has_magic {
        bail!("downloaded file does not look like a PDF (no %PDF- header found)");
    }
    Ok(())
}
// ---------------------------------------------------------------------------
// Conversion
// ---------------------------------------------------------------------------
/// Check if pdftotext output is usable: long enough and mostly readable text.
///
/// Whitespace is ignored by both checks. `pdftotext -layout` pads its output
/// heavily with spaces, and image-only PDFs produce little more than form
/// feeds and newlines; counting that padding as "readable" would let blank
/// or garbled extractions through.
///
/// The text is accepted when its non-whitespace characters
/// - add up to at least 500 bytes, and
/// - are more than 80% alphanumeric or ASCII punctuation.
fn pdftotext_output_is_good(text: &str) -> bool {
    const MIN_VISIBLE_BYTES: usize = 500;
    const MIN_READABLE_RATIO: f64 = 0.8;

    let mut visible_bytes = 0usize;
    let mut visible_chars = 0usize;
    let mut readable_chars = 0usize;
    for c in text.chars().filter(|c| !c.is_whitespace()) {
        visible_bytes += c.len_utf8();
        visible_chars += 1;
        if c.is_alphanumeric() || c.is_ascii_punctuation() {
            readable_chars += 1;
        }
    }

    if visible_bytes < MIN_VISIBLE_BYTES {
        return false;
    }
    // `visible_chars` is non-zero here because `visible_bytes` is.
    readable_chars as f64 / visible_chars as f64 > MIN_READABLE_RATIO
}
/// Attempt a plain-text extraction of the PDF with the `pdftotext` tool.
///
/// The bytes are written to `paper.pdf` in a fresh temporary directory and
/// `pdftotext -layout <file> -` is run on it. Returns `Some(text)` only when
/// the tool exits successfully and `pdftotext_output_is_good` accepts its
/// (lossily UTF-8 decoded) stdout. Every other outcome — temp-file problems,
/// tool missing or failing, poor output — yields `None`, with the reason
/// logged to stderr where one is available.
fn try_pdftotext(pdf_bytes: &[u8]) -> Option<String> {
    let scratch = tempfile::tempdir().ok()?;
    let input = scratch.path().join("paper.pdf");
    std::fs::write(&input, pdf_bytes).ok()?;

    eprintln!("Trying pdftotext…");
    let mut cmd = Command::new("pdftotext");
    cmd.arg("-layout").arg(&input).arg("-");

    let finished = match cmd.output() {
        Ok(done) => done,
        Err(err) => {
            if err.kind() == io::ErrorKind::NotFound {
                eprintln!("pdftotext not found on PATH, skipping simple extraction");
            } else {
                eprintln!("pdftotext failed: {err}");
            }
            return None;
        }
    };

    if !finished.status.success() {
        eprintln!("pdftotext exited with {}", finished.status);
        return None;
    }

    let text = String::from_utf8_lossy(&finished.stdout).into_owned();
    if !pdftotext_output_is_good(&text) {
        eprintln!("pdftotext output is low quality, falling back to marker");
        return None;
    }

    eprintln!("pdftotext output looks good, skipping marker");
    Some(text)
}
/// Write PDF bytes to a temp file, run marker_single, and return the
/// resulting markdown.
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
// Try simple extraction first.
if let Some(text) = try_pdftotext(pdf_bytes) {
return Ok(text);
}
// Fall back to marker-pdf.
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
let pdf_path = tmp_dir.path().join("paper.pdf");
let out_dir = tmp_dir.path().join("output");
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
eprintln!("Converting PDF to markdown with marker…");
let status = Command::new("marker_single")
.arg(&pdf_path)
.arg("--output_dir")
.arg(&out_dir)
.arg("--output_format")
.arg("markdown")
.status();
match status {
Err(e) if e.kind() == io::ErrorKind::NotFound => {
bail!("marker_single not found on PATH. Install it with:\n pip install marker-pdf");
}
Err(e) => bail!("failed to run marker_single: {e}"),
Ok(s) if !s.success() => bail!("marker_single exited with {s}"),
Ok(_) => {}
}
// marker_single creates a subdirectory inside our output dir — find
// the .md file within it.
find_markdown_file(&out_dir)
}
/// Walk `dir` depth-first and return the contents of the first `.md` file
/// encountered.
///
/// Subdirectories are searched as they are met; a subdirectory that yields
/// nothing (or cannot be searched) is skipped and the walk continues with
/// the next entry.
///
/// # Errors
///
/// Fails if `dir` itself or one of its entries cannot be read, if a matching
/// file cannot be opened or read as UTF-8, or if no `.md` file exists.
fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
    let listing = std::fs::read_dir(dir).context("failed to read marker output directory")?;
    for item in listing {
        let candidate = item?.path();

        if candidate.is_dir() {
            match find_markdown_file(&candidate) {
                Ok(found) => return Ok(found),
                Err(_) => continue,
            }
        }

        let is_markdown = candidate.extension().is_some_and(|ext| ext == "md");
        if is_markdown {
            let mut text = String::new();
            let mut file = std::fs::File::open(&candidate)?;
            file.read_to_string(&mut text)?;
            return Ok(text);
        }
    }
    bail!("no .md file found in marker output")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Ordinary English prose, well past the length threshold, is accepted.
    #[test]
    fn good_text_passes_quality_check() {
        let sentence = "This is a normal academic paper abstract. ";
        let prose = sentence.repeat(20);
        assert!(pdftotext_output_is_good(&prose));
    }

    /// A handful of characters is rejected regardless of content.
    #[test]
    fn short_text_fails_quality_check() {
        let snippet = "too short";
        assert!(!pdftotext_output_is_good(snippet));
    }

    /// Long output made of replacement and block-drawing characters is
    /// rejected.
    #[test]
    fn garbled_text_fails_quality_check() {
        let noise = "\u{fffd}\u{25a0}\u{2588}\u{2591}".repeat(200);
        assert!(!pdftotext_output_is_good(&noise));
    }
}