Add paper CLI: download academic papers by DOI and convert to markdown
Downloads PDFs from LibGen (primary) or Anna's Archive API (fallback), converts to markdown via marker_single, and prints to stdout. Includes XDG-compliant caching, nix flake with marker-pdf packaging, and a Claude Code skill for paper-reader integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
f82b738db7
10 changed files with 2860 additions and 0 deletions
1
.envrc
Normal file
1
.envrc
Normal file
|
|
@ -0,0 +1 @@
|
|||
use flake .
|
||||
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
/target
|
||||
/result
|
||||
.direnv/
|
||||
2137
Cargo.lock
generated
Normal file
2137
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
12
Cargo.toml
Normal file
12
Cargo.toml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
[package]
name = "paper"
version = "0.1.0"
edition = "2024"

[dependencies]
# Ad-hoc error handling with context chains (application-style errors).
anyhow = "1"
# CLI argument parsing via #[derive(Parser)].
clap = { version = "4", features = ["derive"] }
# Blocking HTTP client; rustls instead of openssl, JSON for the Anna's Archive API.
reqwest = { version = "0.12", features = ["blocking", "rustls-tls", "json"], default-features = false }
# HTML parsing for scraping LibGen result/edition/ads pages.
scraper = "0.22"
# Untyped JSON values for the fast-download API response.
serde_json = "1"
# Temp directory for the PDF and marker_single output.
tempfile = "3"
|
||||
49
docs/plans/2026-02-19-paper-cli-design.md
Normal file
49
docs/plans/2026-02-19-paper-cli-design.md
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# paper CLI — Design
|
||||
|
||||
A CLI tool that downloads academic papers by DOI — from LibGen, with Anna's Archive as a fallback — and converts them to markdown.
|
||||
|
||||
## CLI Interface
|
||||
|
||||
```
|
||||
paper <DOI>
|
||||
```
|
||||
|
||||
Single positional argument. Markdown output goes to stdout.
|
||||
|
||||
```
|
||||
paper 10.1038/nature12373 > paper.md
|
||||
```
|
||||
|
||||
## Download Flow
|
||||
|
||||
1. Search LibGen (`https://libgen.li/index.php?req=<DOI>`) with a browser-like User-Agent and take the first matching edition
2. Fetch the edition page and extract the file's MD5 from its `ads.php?md5=…` link
3. Fetch the ads page and extract the one-time `key=` from the `get.php` link, then download via `get.php?md5=…&key=…`
4. Fallback: if `ANNAS_ARCHIVE_KEY` is set, resolve the MD5 via LibGen and call Anna's Archive's fast-download JSON API
5. Exit with a clear error if no source yields a valid PDF
|
||||
|
||||
## Conversion
|
||||
|
||||
1. Shell out to `marker_single <tempfile.pdf> --output_dir <tempdir>`
|
||||
2. Read the generated `.md` file from the output dir
|
||||
3. Print to stdout
|
||||
4. Clean up temp dir
|
||||
|
||||
## Error Handling
|
||||
|
||||
- `marker_single` not on PATH: tell user to install (`pip install marker-pdf`)
|
||||
- Conversion failure: forward marker's stderr
|
||||
- Network errors: surface reqwest errors clearly
|
||||
- No PDF found: specific error message with the DOI
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `clap` — argument parsing
|
||||
- `reqwest` (blocking, rustls-tls) — HTTP
|
||||
- `scraper` — HTML parsing
|
||||
- `tempfile` — temp directory
|
||||
- `anyhow` — error handling
|
||||
|
||||
## Dev Environment
|
||||
|
||||
The nix flake includes Rust nightly toolchain and marker-pdf in the devshell.
|
||||
82
flake.lock
generated
Normal file
82
flake.lock
generated
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1771207753,
|
||||
"narHash": "sha256-b9uG8yN50DRQ6A7JdZBfzq718ryYrlmGgqkRm9OOwCE=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d1c15b7d5806069da59e819999d70e1cec0760bf",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixpkgs-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"rust-overlay": "rust-overlay"
|
||||
}
|
||||
},
|
||||
"rust-overlay": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1771470520,
|
||||
"narHash": "sha256-PvytHcaYN5cPUll7FB70mXv1rRsIBRmu47fFfq3haxA=",
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "a1d4cc1f264c45d3745af0d2ca5e59d460e58777",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
55
flake.nix
Normal file
55
flake.nix
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
{
  description = "paper — download papers by DOI and convert to markdown";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    flake-utils.url = "github:numtide/flake-utils";
    rust-overlay = {
      url = "github:oxalica/rust-overlay";
      # Reuse our nixpkgs instead of pulling a second copy.
      inputs.nixpkgs.follows = "nixpkgs";
    };
  };

  outputs = { self, nixpkgs, flake-utils, rust-overlay }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        overlays = [ (import rust-overlay) ];
        pkgs = import nixpkgs { inherit system overlays; };
        # Nightly toolchain (edition 2024 crate) with sources + rust-analyzer
        # for the devshell.
        rust-nightly = pkgs.rust-bin.nightly.latest.default.override {
          extensions = [ "rust-src" "rust-analyzer" ];
        };

        # marker-pdf and its out-of-tree Python deps (see nix/marker.nix).
        marker = import ./nix/marker.nix { inherit pkgs; };

        # The plain Rust binary; deps pinned via the committed Cargo.lock.
        paper = pkgs.rustPlatform.buildRustPackage {
          pname = "paper";
          version = "0.1.0";
          src = pkgs.lib.cleanSource ./.;
          cargoLock.lockFile = ./Cargo.lock;
        };

        # Wrap the paper binary so marker_single is on PATH
        paper-wrapped = pkgs.symlinkJoin {
          name = "paper-${paper.version}";
          paths = [ paper ];
          nativeBuildInputs = [ pkgs.makeWrapper ];
          postBuild = ''
            wrapProgram $out/bin/paper \
              --prefix PATH : ${pkgs.lib.makeBinPath [ marker.markerEnv ]}
          '';
        };
      in
      {
        packages = {
          default = paper-wrapped;
          unwrapped = paper;
        };

        devShells.default = pkgs.mkShell {
          buildInputs = [
            rust-nightly
            marker.markerEnv
          ];
        };
      });
}
|
||||
136
nix/marker.nix
Normal file
136
nix/marker.nix
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
# Nix expressions for marker-pdf and its missing dependencies.
# None of these packages exist in nixpkgs, so each is built from PyPI
# artifacts with hashes pinned below.
{ pkgs }:

let
  python3Packages = pkgs.python3Packages;

  # pypdfium2 4.30.0 — pinned because pdftext and surya-ocr require v4.x API.
  # Installed from the manylinux wheel which bundles libpdfium.
  pypdfium2 = python3Packages.buildPythonPackage rec {
    pname = "pypdfium2";
    version = "4.30.0";
    format = "wheel";

    src = pkgs.fetchurl {
      url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
      hash = "sha256-8feNIYng3fmsK3qbm9Twxm9U0Tif9sF+n9ncA00G6z8=";
    };

    # The wheel ships a prebuilt libpdfium.so; patch its interpreter/rpath
    # against nixpkgs' libstdc++.
    nativeBuildInputs = [ pkgs.autoPatchelfHook ];
    buildInputs = [ pkgs.stdenv.cc.cc.lib ];

    pythonImportsCheck = [ "pypdfium2" ];
  };

  # pdftext — text-extraction layer marker uses for digital (non-OCR) PDFs.
  pdftext = python3Packages.buildPythonPackage rec {
    pname = "pdftext";
    version = "0.6.3";
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit pname version;
      hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
    };

    build-system = [ python3Packages.poetry-core ];
    # Upstream pins exact dep versions; relax so nixpkgs' versions satisfy them.
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;

    dependencies = [
      pypdfium2
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.click
    ];

    # Tests require PDF fixtures not included in the sdist
    doCheck = false;
    pythonImportsCheck = [ "pdftext" ];
  };

  # surya-ocr — OCR / layout-detection models driving marker's recognition.
  # PyPI name uses an underscore (surya_ocr) while the package name uses a dash.
  surya-ocr = python3Packages.buildPythonPackage rec {
    pname = "surya-ocr";
    version = "0.17.1";
    pyproject = true;

    src = pkgs.fetchPypi {
      pname = "surya_ocr";
      inherit version;
      hash = "sha256-NJ142FTB7V+Bblg1Re1kUaoLxpkuKDqAUDR5mqzujCQ=";
    };

    build-system = [ python3Packages.poetry-core ];
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    # pre-commit is a dev-only tool; drop it from runtime deps.
    pythonRemoveDeps = [ "pre-commit" ];

    dependencies = [
      python3Packages.transformers
      python3Packages.torch
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.python-dotenv
      python3Packages.pillow
      pypdfium2
      python3Packages.filetype
      python3Packages.click
      python3Packages.platformdirs
      python3Packages.opencv-python-headless
      python3Packages.einops
    ];

    # Tests require model weights and GPU
    doCheck = false;
    pythonImportsCheck = [ "surya" ];
  };

  # marker-pdf — the PDF→markdown converter providing the marker_single CLI.
  marker-pdf = python3Packages.buildPythonPackage rec {
    pname = "marker-pdf";
    version = "1.10.2";
    pyproject = true;

    src = pkgs.fetchPypi {
      pname = "marker_pdf";
      inherit version;
      hash = "sha256-zg/IOeEa11GaV20lTKnVGg+UVLnX2gIhH3IrFBMX+fE=";
    };

    build-system = [ python3Packages.poetry-core ];
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    pythonRemoveDeps = [ "pre-commit" ];

    dependencies = [
      python3Packages.pillow
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.transformers
      python3Packages.python-dotenv
      python3Packages.torch
      python3Packages.tqdm
      python3Packages.ftfy
      python3Packages.rapidfuzz
      surya-ocr
      python3Packages.regex
      pdftext
      python3Packages.markdownify
      python3Packages.click
      python3Packages.markdown2
      python3Packages.filetype
      python3Packages.google-genai
      python3Packages.anthropic
      python3Packages.scikit-learn
      python3Packages.openai
    ];

    # Tests require model weights
    doCheck = false;
    pythonImportsCheck = [ "marker" ];
  };
in
{
  inherit pypdfium2 pdftext surya-ocr marker-pdf;

  # Python environment with marker_single on PATH
  markerEnv = python3Packages.python.withPackages (_: [ marker-pdf ]);
}
|
||||
41
skill/SKILL.md
Normal file
41
skill/SKILL.md
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
---
|
||||
name: paper-reader
|
||||
description: Fetch and read academic papers by DOI. Use when (1) the user mentions a DOI (e.g., 10.1038/nature12373), asks to read/summarize/analyze a research paper, or references a paper they want to work with, or (2) Claude needs to consult a specific paper as part of research — e.g., a web search returns a relevant DOI, or a cited paper would help answer the user's question. Converts PDFs to markdown so the paper content can be read and discussed.
|
||||
---
|
||||
|
||||
# Paper Reader
|
||||
|
||||
Fetch academic papers by DOI using the `paper` CLI, which downloads PDFs and converts them to markdown via `marker_single`.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Fetch a paper and save to a temp file, then read it
|
||||
paper <DOI> > /tmp/paper.md
|
||||
|
||||
# Bypass cache to re-download
|
||||
paper --no-cache <DOI> > /tmp/paper.md
|
||||
```
|
||||
|
||||
Output goes to stdout (markdown). Progress/status goes to stderr. DOIs can be passed with or without the `https://doi.org/` prefix.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Extract the DOI from the user's message (look for patterns like `10.xxxx/...`)
|
||||
2. Run `paper <DOI> > /tmp/paper-<sanitized-doi>.md` via Bash
|
||||
3. Read the resulting markdown file
|
||||
4. Respond to what the user asked (summarize, explain, answer questions, etc.)
|
||||
|
||||
## Caching
|
||||
|
||||
Results are cached at `~/.cache/paper/<DOI>.md`. Subsequent requests for the same DOI return instantly. Use `--no-cache` only when the user explicitly wants a fresh conversion.
|
||||
|
||||
## Download Sources
|
||||
|
||||
The tool tries LibGen first (free, no authentication), then falls back to Anna's Archive fast download API if `ANNAS_ARCHIVE_KEY` is set.
|
||||
|
||||
## Errors
|
||||
|
||||
- **"marker_single not found"**: The `marker_single` Python tool is not installed. Run `pip install marker-pdf` or use the nix devshell in `~/proj/paper`.
|
||||
- **"no results found on LibGen"**: The DOI may not be in LibGen's collection. Verify the DOI is correct.
|
||||
- **"all download sources failed"**: Neither LibGen nor Anna's Archive had the paper. The user may need to find it manually.
|
||||
344
src/main.rs
Normal file
344
src/main.rs
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
use std::io::{self, Read};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use clap::Parser;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
/// Browser-like User-Agent sent with every request; some mirrors reject
/// requests that advertise a default HTTP-library UA.
const USER_AGENT: &str =
    "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0";
/// Base URL of the LibGen mirror used for search, edition pages, and downloads.
const LIBGEN_BASE: &str = "https://libgen.li";
||||
|
||||
#[derive(Parser)]
|
||||
#[command(about = "Download a paper by DOI and convert it to markdown")]
|
||||
struct Args {
|
||||
/// The DOI of the paper to download
|
||||
doi: String,
|
||||
|
||||
/// Skip the cache and re-download/re-convert
|
||||
#[arg(long)]
|
||||
no_cache: bool,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
let doi = args.doi.trim_start_matches("https://doi.org/");
|
||||
|
||||
if !args.no_cache {
|
||||
if let Some(cached) = read_cache(doi) {
|
||||
print!("{cached}");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
let pdf_bytes = download_pdf(doi)?;
|
||||
let markdown = convert_to_markdown(&pdf_bytes)?;
|
||||
|
||||
write_cache(doi, &markdown);
|
||||
print!("{markdown}");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cache
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Return the cache directory: `$XDG_CACHE_HOME/paper` or `~/.cache/paper`.
///
/// Per the XDG Base Directory spec, an *empty* `XDG_CACHE_HOME` is treated
/// as unset and falls back to `~/.cache`. Returns `None` only when neither
/// `XDG_CACHE_HOME` nor `HOME` is usable.
fn cache_dir() -> Option<PathBuf> {
    let base = std::env::var_os("XDG_CACHE_HOME")
        // Empty value == unset, per the XDG spec.
        .filter(|v| !v.is_empty())
        .map(PathBuf::from)
        .or_else(|| std::env::var_os("HOME").map(|h| PathBuf::from(h).join(".cache")))?;
    Some(base.join("paper"))
}
|
||||
|
||||
/// Path to a cached markdown file for a given DOI.
|
||||
/// DOI `10.1038/nature12373` maps to `<cache>/10.1038/nature12373.md`.
|
||||
fn cache_path(doi: &str) -> Option<PathBuf> {
|
||||
cache_dir().map(|d| d.join(format!("{doi}.md")))
|
||||
}
|
||||
|
||||
fn read_cache(doi: &str) -> Option<String> {
|
||||
let path = cache_path(doi)?;
|
||||
match std::fs::read_to_string(&path) {
|
||||
Ok(content) => {
|
||||
eprintln!("Using cached result from {}", path.display());
|
||||
Some(content)
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn write_cache(doi: &str, markdown: &str) {
|
||||
let Some(path) = cache_path(doi) else { return };
|
||||
if let Some(parent) = path.parent() {
|
||||
let _ = std::fs::create_dir_all(parent);
|
||||
}
|
||||
let _ = std::fs::write(&path, markdown);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Download
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn http_client() -> anyhow::Result<reqwest::blocking::Client> {
|
||||
Ok(reqwest::blocking::Client::builder()
|
||||
.user_agent(USER_AGENT)
|
||||
.build()?)
|
||||
}
|
||||
|
||||
/// Download a paper PDF by DOI.
|
||||
///
|
||||
/// Tries LibGen first (free, no JS challenge). If that fails and an Anna's
|
||||
/// Archive API key is configured via `ANNAS_ARCHIVE_KEY`, tries the fast
|
||||
/// download API as a fallback.
|
||||
fn download_pdf(doi: &str) -> anyhow::Result<Vec<u8>> {
|
||||
let client = http_client()?;
|
||||
|
||||
// Try LibGen first.
|
||||
match download_via_libgen(&client, doi) {
|
||||
Ok(bytes) => return Ok(bytes),
|
||||
Err(e) => eprintln!("LibGen failed: {e:#}"),
|
||||
}
|
||||
|
||||
// Try Anna's Archive fast download API if a key is available.
|
||||
// This requires an MD5 — attempt to resolve one from LibGen even if the
|
||||
// download itself failed (the search may have worked).
|
||||
if let Ok(key) = std::env::var("ANNAS_ARCHIVE_KEY") {
|
||||
if let Some(md5) = resolve_md5_from_libgen(&client, doi) {
|
||||
match download_via_annas_archive(&client, &md5, &key) {
|
||||
Ok(bytes) => return Ok(bytes),
|
||||
Err(e) => eprintln!("Anna's Archive API failed: {e:#}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bail!("all download sources failed for DOI {doi}")
|
||||
}
|
||||
|
||||
// -- LibGen -----------------------------------------------------------------
|
||||
|
||||
/// Resolve a DOI to a paper MD5 via LibGen search + edition page.
|
||||
fn resolve_md5_from_libgen(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> Option<String> {
|
||||
let edition_id = libgen_search(client, doi).ok()?;
|
||||
libgen_edition_md5(client, &edition_id).ok()
|
||||
}
|
||||
|
||||
/// Download a paper PDF from LibGen by DOI.
|
||||
fn download_via_libgen(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
eprintln!("Searching LibGen for DOI {doi}");
|
||||
let edition_id = libgen_search(client, doi)?;
|
||||
|
||||
eprintln!("Found edition {edition_id}, resolving download link…");
|
||||
let md5 = libgen_edition_md5(client, &edition_id)?;
|
||||
let download_key = libgen_download_key(client, &md5)?;
|
||||
|
||||
let download_url = format!("{LIBGEN_BASE}/get.php?md5={md5}&key={download_key}");
|
||||
eprintln!("Downloading PDF…");
|
||||
|
||||
let bytes = client
|
||||
.get(&download_url)
|
||||
.send()
|
||||
.context("failed to request PDF from LibGen")?
|
||||
.bytes()
|
||||
.context("failed to read PDF body")?;
|
||||
|
||||
validate_pdf(&bytes)?;
|
||||
Ok(bytes.to_vec())
|
||||
}
|
||||
|
||||
/// Search LibGen by DOI and return the first matching edition ID.
|
||||
fn libgen_search(
|
||||
client: &reqwest::blocking::Client,
|
||||
doi: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/index.php?req={doi}&topics%5B%5D=a&res=25");
|
||||
let html = client
|
||||
.get(&url)
|
||||
.send()
|
||||
.context("failed to search LibGen")?
|
||||
.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel =
|
||||
Selector::parse("a[href*='edition.php?id=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
if let Some(id) = href.strip_prefix("edition.php?id=") {
|
||||
return Ok(id.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no results found on LibGen for DOI {doi}")
|
||||
}
|
||||
|
||||
/// Fetch a LibGen edition page and extract the file's MD5.
|
||||
fn libgen_edition_md5(
|
||||
client: &reqwest::blocking::Client,
|
||||
edition_id: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/edition.php?id={edition_id}");
|
||||
let html = client.get(&url).send()?.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel = Selector::parse("a[href*='ads.php?md5=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
if let Some(rest) = href.strip_prefix("ads.php?md5=") {
|
||||
// href may have extra params after the md5
|
||||
let md5 = rest.split('&').next().unwrap_or(rest);
|
||||
return Ok(md5.to_string());
|
||||
}
|
||||
if let Some(rest) = href.strip_prefix("/ads.php?md5=") {
|
||||
let md5 = rest.split('&').next().unwrap_or(rest);
|
||||
return Ok(md5.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no download link found on edition page {edition_id}")
|
||||
}
|
||||
|
||||
/// Fetch the LibGen ads/download page for an MD5 and extract the one-time
|
||||
/// download key.
|
||||
fn libgen_download_key(
|
||||
client: &reqwest::blocking::Client,
|
||||
md5: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let url = format!("{LIBGEN_BASE}/ads.php?md5={md5}");
|
||||
let html = client.get(&url).send()?.text()?;
|
||||
|
||||
let doc = Html::parse_document(&html);
|
||||
let sel = Selector::parse("a[href*='get.php?md5=']").expect("valid selector");
|
||||
|
||||
for el in doc.select(&sel) {
|
||||
if let Some(href) = el.value().attr("href") {
|
||||
// Extract key= param from the get.php link
|
||||
if let Some(idx) = href.find("key=") {
|
||||
let key = &href[idx + 4..];
|
||||
let key = key.split('&').next().unwrap_or(key);
|
||||
if !key.is_empty() {
|
||||
return Ok(key.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("no download key found on LibGen ads page for md5 {md5}")
|
||||
}
|
||||
|
||||
// -- Anna's Archive ---------------------------------------------------------
|
||||
|
||||
/// Download a paper PDF via the Anna's Archive fast download JSON API.
|
||||
fn download_via_annas_archive(
|
||||
client: &reqwest::blocking::Client,
|
||||
md5: &str,
|
||||
key: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
eprintln!("Trying Anna's Archive fast download API…");
|
||||
|
||||
let api_url = format!(
|
||||
"https://annas-archive.li/dyn/api/fast_download.json?md5={md5}&key={key}"
|
||||
);
|
||||
|
||||
let resp: serde_json::Value = client
|
||||
.get(&api_url)
|
||||
.send()
|
||||
.context("failed to call Anna's Archive API")?
|
||||
.json()
|
||||
.context("failed to parse Anna's Archive API response")?;
|
||||
|
||||
if let Some(err) = resp.get("error").and_then(|e| e.as_str()) {
|
||||
if !err.is_empty() {
|
||||
bail!("Anna's Archive API error: {err}");
|
||||
}
|
||||
}
|
||||
|
||||
let download_url = resp
|
||||
.get("download_url")
|
||||
.and_then(|u| u.as_str())
|
||||
.context("no download_url in Anna's Archive API response")?;
|
||||
|
||||
eprintln!("Downloading PDF from Anna's Archive…");
|
||||
let bytes = client
|
||||
.get(download_url)
|
||||
.send()
|
||||
.context("failed to download from Anna's Archive")?
|
||||
.bytes()?;
|
||||
|
||||
validate_pdf(&bytes)?;
|
||||
Ok(bytes.to_vec())
|
||||
}
|
||||
|
||||
// -- Helpers ----------------------------------------------------------------
|
||||
|
||||
fn validate_pdf(bytes: &[u8]) -> anyhow::Result<()> {
|
||||
if bytes.len() < 1024 {
|
||||
bail!(
|
||||
"downloaded file is suspiciously small ({} bytes) — may not be a valid PDF",
|
||||
bytes.len()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Conversion
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Write PDF bytes to a temp file, run marker_single, and return the
|
||||
/// resulting markdown.
|
||||
fn convert_to_markdown(pdf_bytes: &[u8]) -> anyhow::Result<String> {
|
||||
let tmp_dir = tempfile::tempdir().context("failed to create temp directory")?;
|
||||
let pdf_path = tmp_dir.path().join("paper.pdf");
|
||||
let out_dir = tmp_dir.path().join("output");
|
||||
|
||||
std::fs::write(&pdf_path, pdf_bytes).context("failed to write temp PDF")?;
|
||||
|
||||
eprintln!("Converting PDF to markdown…");
|
||||
|
||||
let status = Command::new("marker_single")
|
||||
.arg(&pdf_path)
|
||||
.arg("--output_dir")
|
||||
.arg(&out_dir)
|
||||
.arg("--output_format")
|
||||
.arg("markdown")
|
||||
.status();
|
||||
|
||||
match status {
|
||||
Err(e) if e.kind() == io::ErrorKind::NotFound => {
|
||||
bail!("marker_single not found on PATH. Install it with:\n pip install marker-pdf");
|
||||
}
|
||||
Err(e) => bail!("failed to run marker_single: {e}"),
|
||||
Ok(s) if !s.success() => bail!("marker_single exited with {s}"),
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
// marker_single creates a subdirectory inside our output dir — find
|
||||
// the .md file within it.
|
||||
find_markdown_file(&out_dir)
|
||||
}
|
||||
|
||||
/// Recursively search a directory for the first .md file and read it.
|
||||
fn find_markdown_file(dir: &std::path::Path) -> anyhow::Result<String> {
|
||||
for entry in std::fs::read_dir(dir).context("failed to read marker output directory")? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
if let Ok(md) = find_markdown_file(&path) {
|
||||
return Ok(md);
|
||||
}
|
||||
} else if path.extension().is_some_and(|ext| ext == "md") {
|
||||
let mut content = String::new();
|
||||
std::fs::File::open(&path)?.read_to_string(&mut content)?;
|
||||
return Ok(content);
|
||||
}
|
||||
}
|
||||
bail!("no .md file found in marker output")
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue