feat: try Unpaywall for open-access PDFs before LibGen
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ff29d6109d
commit
21a61b1c75
1 changed file with 54 additions and 4 deletions
58
src/main.rs
58
src/main.rs
|
|
@ -89,13 +89,19 @@ fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
|
||||||
|
|
||||||
/// Download a paper PDF by DOI.
|
/// Download a paper PDF by DOI.
|
||||||
///
|
///
|
||||||
/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's
|
/// Tries Unpaywall first (free open-access, requires `UNPAYWALL_EMAIL`).
|
||||||
/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast
|
/// Falls back to LibGen (free, no JS challenge), then Anna's Archive if an
|
||||||
/// download API as a fallback.
|
/// API key is configured via `ANNAS_ARCHIVE_KEY`.
|
||||||
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
||||||
let client = http_client()?;
|
let client = http_client()?;
|
||||||
|
|
||||||
// Try LibGen first.
|
// Try Unpaywall first (free open-access).
|
||||||
|
match download_via_unpaywall(&client, doi) {
|
||||||
|
Ok(bytes) => return Ok(bytes),
|
||||||
|
Err(e) => eprintln!("Unpaywall: {e:#}"),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try LibGen.
|
||||||
match download_via_libgen(&client, doi) {
|
match download_via_libgen(&client, doi) {
|
||||||
Ok(bytes) => return Ok(bytes),
|
Ok(bytes) => return Ok(bytes),
|
||||||
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
||||||
|
|
@ -276,6 +282,50 @@ fn download_via_annas_archive(
|
||||||
Ok(bytes.to_vec())
|
Ok(bytes.to_vec())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- Unpaywall ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Try downloading an open-access PDF via the Unpaywall API.
|
||||||
|
fn download_via_unpaywall(
|
||||||
|
client: &reqwest::blocking::Client,
|
||||||
|
doi: &str,
|
||||||
|
) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let email = std::env::var("UNPAYWALL_EMAIL")
|
||||||
|
.context("UNPAYWALL_EMAIL not set")?;
|
||||||
|
|
||||||
|
eprintln!("Checking Unpaywall for open-access PDF…");
|
||||||
|
|
||||||
|
let api_url = format!(
|
||||||
|
"https://api.unpaywall.org/v2/{doi}?email={email}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let resp: serde_json::Value = client
|
||||||
|
.get(&api_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to call Unpaywall API")?
|
||||||
|
.error_for_status()
|
||||||
|
.context("Unpaywall API returned an error status")?
|
||||||
|
.json()
|
||||||
|
.context("failed to parse Unpaywall API response")?;
|
||||||
|
|
||||||
|
let pdf_url = resp
|
||||||
|
.get("best_oa_location")
|
||||||
|
.and_then(|loc| loc.get("url_for_pdf"))
|
||||||
|
.and_then(|u| u.as_str())
|
||||||
|
.context("no open-access PDF available via Unpaywall")?;
|
||||||
|
|
||||||
|
eprintln!("Downloading open-access PDF from {pdf_url}");
|
||||||
|
let bytes = client
|
||||||
|
.get(pdf_url)
|
||||||
|
.send()
|
||||||
|
.context("failed to download from Unpaywall PDF URL")?
|
||||||
|
.error_for_status()
|
||||||
|
.context("Unpaywall PDF URL returned an error status")?
|
||||||
|
.bytes()?;
|
||||||
|
|
||||||
|
validate_pdf(&bytes)?;
|
||||||
|
Ok(bytes.to_vec())
|
||||||
|
}
|
||||||
|
|
||||||
// -- Helpers ----------------------------------------------------------------
|
// -- Helpers ----------------------------------------------------------------
|
||||||
|
|
||||||
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue