feat: try Unpaywall for open-access PDFs before LibGen

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Ellie 2026-02-25 13:30:14 -08:00
parent ff29d6109d
commit 21a61b1c75

View file

@ -89,13 +89,19 @@ fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
/// Download a paper PDF by DOI.
///
/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's
/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast
/// download API as a fallback.
/// Tries Unpaywall first (free open-access, requires `UNPAYWALL_EMAIL`).
/// Falls back to LibGen (free, no JS challenge), then Anna's Archive if an
/// API key is configured via `ANNAS_ARCHIVE_KEY`.
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
let client = http_client()?;
// Try LibGen first.
// Try Unpaywall first (free open-access).
match download_via_unpaywall(&client, doi) {
Ok(bytes) => return Ok(bytes),
Err(e) => eprintln!("Unpaywall: {e:#}"),
}
// Try LibGen.
match download_via_libgen(&client, doi) {
Ok(bytes) => return Ok(bytes),
Err(e) => eprintln!("LibGen failed: {e:#}"),
@ -276,6 +282,50 @@ fn download_via_annas_archive(
Ok(bytes.to_vec())
}
// -- Unpaywall ---------------------------------------------------------------
/// Try downloading an open-access PDF via the Unpaywall API.
///
/// Looks up `doi` with the Unpaywall v2 API, passing the address from the
/// `UNPAYWALL_EMAIL` environment variable as the `email` query parameter,
/// then downloads the `best_oa_location.url_for_pdf` link from the response.
///
/// # Errors
///
/// Fails if `UNPAYWALL_EMAIL` is not set, if either HTTP request fails or
/// returns an error status, if the API response cannot be parsed as JSON or
/// has no `best_oa_location.url_for_pdf` string, or if `validate_pdf` rejects
/// the downloaded bytes.
fn download_via_unpaywall(
    client: &reqwest::blocking::Client,
    doi: &str,
) -> anyhow::Result<Vec<u8>> {
    let email = std::env::var("UNPAYWALL_EMAIL")
        .context("UNPAYWALL_EMAIL not set")?;

    eprintln!("Checking Unpaywall for open-access PDF…");

    // Build the URL structurally instead of with `format!`, so that reserved
    // characters are percent-encoded: a `+` in the e-mail address would
    // otherwise be decoded as a space, and a `#` or `?` inside the DOI would
    // truncate the path.
    let mut api_url = reqwest::Url::parse("https://api.unpaywall.org/v2")
        .context("failed to build Unpaywall API URL")?;
    api_url
        .path_segments_mut()
        .ok()
        .context("failed to build Unpaywall API URL")?
        // Split on `/` so the DOI's prefix/suffix separator stays a real
        // path separator rather than being encoded as `%2F`.
        .extend(doi.split('/'));
    api_url.query_pairs_mut().append_pair("email", &email);

    let resp: serde_json::Value = client
        .get(api_url)
        .send()
        .context("failed to call Unpaywall API")?
        .error_for_status()
        .context("Unpaywall API returned an error status")?
        .json()
        .context("failed to parse Unpaywall API response")?;

    // A missing or null `best_oa_location` / `url_for_pdf` yields `None`
    // here and is reported as "no PDF available".
    let pdf_url = resp
        .get("best_oa_location")
        .and_then(|loc| loc.get("url_for_pdf"))
        .and_then(|u| u.as_str())
        .context("no open-access PDF available via Unpaywall")?;

    eprintln!("Downloading open-access PDF from {pdf_url}");

    let bytes = client
        .get(pdf_url)
        .send()
        .context("failed to download from Unpaywall PDF URL")?
        .error_for_status()
        .context("Unpaywall PDF URL returned an error status")?
        .bytes()
        .context("failed to read Unpaywall PDF response body")?;

    // Reject responses that are not actually a PDF before returning them.
    validate_pdf(&bytes)?;
    Ok(bytes.to_vec())
}
// -- Helpers ----------------------------------------------------------------
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {