From 21a61b1c75bab5e2f45504478ed7bca29d4b3fc0 Mon Sep 17 00:00:00 2001 From: Ellie <6687206+wizzeh@users.noreply.github.com> Date: Wed, 25 Feb 2026 13:30:14 -0800 Subject: [PATCH] feat: try Unpaywall for open-access PDFs before LibGen Co-Authored-By: Claude Opus 4.6 --- src/main.rs | 58 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index bd75521..59a6759 100644 --- a/src/main.rs +++ b/src/main.rs @@ -89,13 +89,19 @@ fn http_client() -> anyhow::Result { /// Download a paper PDF by DOI. /// -/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's -/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast -/// download API as a fallback. +/// Tries Unpaywall first (free open-access, requires `UNPAYWALL_EMAIL`). +/// Falls back to LibGen (free, no JS challenge), then Anna's Archive if an +/// API key is configured via `ANNAS_ARCHIVE_KEY`. fn download_pdf(doi: &str) -> anyhow::Result> { let client = http_client()?; - // Try LibGen first. + // Try Unpaywall first (free open-access). + match download_via_unpaywall(&client, doi) { + Ok(bytes) => return Ok(bytes), + Err(e) => eprintln!("Unpaywall: {e:#}"), + } + + // Try LibGen. match download_via_libgen(&client, doi) { Ok(bytes) => return Ok(bytes), Err(e) => eprintln!("LibGen failed: {e:#}"), @@ -276,6 +282,50 @@ fn download_via_annas_archive( Ok(bytes.to_vec()) } +// -- Unpaywall --------------------------------------------------------------- + +/// Try downloading an open-access PDF via the Unpaywall API. +fn download_via_unpaywall( + client: &reqwest::blocking::Client, + doi: &str, +) -> anyhow::Result> { + let email = std::env::var("UNPAYWALL_EMAIL") + .context("UNPAYWALL_EMAIL not set")?; + + eprintln!("Checking Unpaywall for open-access PDF…"); + + let api_url = format!( + "https://api.unpaywall.org/v2/{doi}?email={email}" + ); + + let resp: serde_json::Value = client + .get(&api_url) + .send() + .context("failed to call Unpaywall API")? + .error_for_status() + .context("Unpaywall API returned an error status")? + .json() + .context("failed to parse Unpaywall API response")?; + + let pdf_url = resp + .get("best_oa_location") + .and_then(|loc| loc.get("url_for_pdf")) + .and_then(|u| u.as_str()) + .context("no open-access PDF available via Unpaywall")?; + + eprintln!("Downloading open-access PDF from {pdf_url}"); + let bytes = client + .get(pdf_url) + .send() + .context("failed to download from Unpaywall PDF URL")? + .error_for_status() + .context("Unpaywall PDF URL returned an error status")? + .bytes()?; + + validate_pdf(&bytes)?; + Ok(bytes.to_vec()) +} + // -- Helpers ---------------------------------------------------------------- fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {