Add paper CLI: download academic papers by DOI and convert to markdown

Downloads PDFs from LibGen (primary) or Anna's Archive API (fallback),
converts to markdown via marker_single, and prints to stdout. Includes
XDG-compliant caching, nix flake with marker-pdf packaging, and a
Claude Code skill for paper-reader integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Ellie 2026-02-19 22:54:30 -08:00
commit f82b738db7
10 changed files with 2860 additions and 0 deletions

136
nix/marker.nix Normal file
View file

@@ -0,0 +1,136 @@
# Nix expressions for marker-pdf and its missing dependencies.
{ pkgs }:
let
# Package set for the default Python 3 interpreter; used by every binding below.
inherit (pkgs) python3Packages;
# pypdfium2 4.30.0 — pinned because pdftext and surya-ocr require v4.x API.
# Installed from the manylinux wheel which bundles libpdfium.
pypdfium2 = python3Packages.buildPythonPackage {
  pname = "pypdfium2";
  version = "4.30.0";
  format = "wheel";

  # Prebuilt wheel fetched directly from PyPI. The path component of the URL is
  # specific to this exact file, so `url`, `hash` and `version` must be bumped
  # together.
  src = pkgs.fetchurl {
    url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
    hash = "sha256-8feNIYng3fmsK3qbm9Twxm9U0Tif9sF+n9ncA00G6z8=";
  };

  # autoPatchelfHook rewrites the bundled shared library so that it resolves
  # its runtime dependencies from buildInputs (the compiler's runtime libs).
  nativeBuildInputs = [ pkgs.autoPatchelfHook ];
  buildInputs = [ pkgs.stdenv.cc.cc.lib ];

  pythonImportsCheck = [ "pypdfium2" ];

  meta = {
    # The wheel above only ships an x86_64 Linux binary. Declaring that here
    # makes other systems fail at evaluation with a clear "unsupported system"
    # message instead of failing late during patching or the import check.
    platforms = [ "x86_64-linux" ];
    # Mark the package as a prebuilt binary rather than something built from source.
    sourceProvenance = [ pkgs.lib.sourceTypes.binaryNativeCode ];
  };
};
# pdftext, built from the PyPI sdist with poetry-core as the build backend.
pdftext =
  let
    pname = "pdftext";
    version = "0.6.3";
  in
  python3Packages.buildPythonPackage {
    inherit pname version;
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit pname version;
      hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
    };

    build-system = [ python3Packages.poetry-core ];

    # Relax every version constraint declared by upstream so the package
    # accepts whatever versions the package set (and this file) provide.
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;

    dependencies = [
      pypdfium2 # the let-bound build from this file, not python3Packages.pypdfium2
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.click
    ];

    # Tests require PDF fixtures not included in the sdist
    doCheck = false;
    # With the test suite off, at least verify that the module imports.
    pythonImportsCheck = [ "pdftext" ];
  };
# surya-ocr, built from the PyPI sdist (published there as "surya_ocr").
surya-ocr =
  let
    version = "0.17.1";
  in
  python3Packages.buildPythonPackage {
    pname = "surya-ocr";
    inherit version;
    pyproject = true;

    src = pkgs.fetchPypi {
      # The sdist name on PyPI uses an underscore, unlike the Nix pname.
      pname = "surya_ocr";
      inherit version;
      hash = "sha256-NJ142FTB7V+Bblg1Re1kUaoLxpkuKDqAUDR5mqzujCQ=";
    };

    build-system = [ python3Packages.poetry-core ];

    # Relax all upstream version constraints, and remove the pre-commit
    # requirement entirely (presumably a development-only tool — confirm).
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    pythonRemoveDeps = [ "pre-commit" ];

    dependencies = [
      python3Packages.transformers
      python3Packages.torch
      python3Packages.pydantic
      python3Packages.pydantic-settings
      python3Packages.python-dotenv
      python3Packages.pillow
      pypdfium2 # the let-bound build from this file
      python3Packages.filetype
      python3Packages.click
      python3Packages.platformdirs
      python3Packages.opencv-python-headless
      python3Packages.einops
    ];

    # Tests require model weights and GPU
    doCheck = false;
    # The importable module is named "surya", not "surya_ocr".
    pythonImportsCheck = [ "surya" ];
  };
# marker-pdf, built from the PyPI sdist (published there as "marker_pdf").
marker-pdf =
  let
    version = "1.10.2";
    # Short alias to keep the dependency list readable.
    py = python3Packages;
  in
  py.buildPythonPackage {
    pname = "marker-pdf";
    inherit version;
    pyproject = true;

    src = pkgs.fetchPypi {
      # The sdist name on PyPI uses an underscore, unlike the Nix pname.
      pname = "marker_pdf";
      inherit version;
      hash = "sha256-zg/IOeEa11GaV20lTKnVGg+UVLnX2gIhH3IrFBMX+fE=";
    };

    build-system = [ py.poetry-core ];

    # Relax all upstream version constraints, and remove the pre-commit
    # requirement entirely (presumably a development-only tool — confirm).
    nativeBuildInputs = [ py.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    pythonRemoveDeps = [ "pre-commit" ];

    # `surya-ocr` and `pdftext` are the let-bound builds from this file;
    # everything else comes from the python3Packages set.
    dependencies = [
      py.pillow
      py.pydantic
      py.pydantic-settings
      py.transformers
      py.python-dotenv
      py.torch
      py.tqdm
      py.ftfy
      py.rapidfuzz
      surya-ocr
      py.regex
      pdftext
      py.markdownify
      py.click
      py.markdown2
      py.filetype
      py.google-genai
      py.anthropic
      py.scikit-learn
      py.openai
    ];

    # Tests require model weights
    doCheck = false;
    # The importable module is named "marker".
    pythonImportsCheck = [ "marker" ];
  };
in
{
  # Re-export the individual packages under their let-bound names.
  inherit
    pypdfium2
    pdftext
    surya-ocr
    marker-pdf
    ;

  # Python environment with marker-pdf installed, so marker_single is on PATH.
  markerEnv = python3Packages.python.withPackages (_pythonPackages: [ marker-pdf ]);
}