Add paper CLI: download academic papers by DOI and convert to markdown
Downloads PDFs from LibGen (primary) or Anna's Archive API (fallback), converts to markdown via marker_single, and prints to stdout. Includes XDG-compliant caching, nix flake with marker-pdf packaging, and a Claude Code skill for paper-reader integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
f82b738db7
10 changed files with 2860 additions and 0 deletions
136
nix/marker.nix
Normal file
136
nix/marker.nix
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
# Nix expressions for marker-pdf and its missing dependencies.
|
||||
{ pkgs }:
|
||||
|
||||
let
|
||||
# Python package set that every derivation below is built against.
inherit (pkgs) python3Packages;
|
||||
|
||||
# pypdfium2 4.30.0 — pinned because pdftext and surya-ocr require the v4.x API.
# Installed from the prebuilt manylinux wheel, which bundles libpdfium.
pypdfium2 = python3Packages.buildPythonPackage {
  pname = "pypdfium2";
  version = "4.30.0";
  format = "wheel";

  # The URL (including its per-file path component) identifies this exact
  # wheel; when bumping, update `version`, `url` and `hash` together.
  src = pkgs.fetchurl {
    url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
    hash = "sha256-8feNIYng3fmsK3qbm9Twxm9U0Tif9sF+n9ncA00G6z8=";
  };

  # The wheel ships prebuilt ELF binaries; autoPatchelfHook rewrites them so
  # their shared-library dependencies resolve inside the Nix store.
  nativeBuildInputs = [ pkgs.autoPatchelfHook ];
  # Compiler runtime libraries made available for autoPatchelfHook to link in.
  buildInputs = [ pkgs.stdenv.cc.cc.lib ];

  pythonImportsCheck = [ "pypdfium2" ];

  # The source above is an x86_64 Linux binary wheel. Declare that explicitly
  # so other systems are rejected at evaluation time with a clear
  # "unsupported platform" message instead of failing late in the build.
  meta = {
    platforms = [ "x86_64-linux" ];
  };
};
|
||||
|
||||
# pdftext, built from its PyPI sdist with the poetry-core build backend.
pdftext =
  let
    pname = "pdftext";
    version = "0.6.3";
  in
  python3Packages.buildPythonPackage {
    inherit pname version;
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit pname version;
      hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
    };

    build-system = [ python3Packages.poetry-core ];

    # Relax every version constraint declared by upstream so the package
    # accepts whatever versions the package set (and the pinned pypdfium2
    # above) provide.
    nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;

    # The locally pinned pypdfium2 comes first, followed by the stock
    # packages from the package set (same order as declared originally).
    dependencies =
      [ pypdfium2 ]
      ++ (with python3Packages; [
        pydantic
        pydantic-settings
        click
      ]);

    # Tests require PDF fixtures not included in the sdist, so only an
    # import smoke check is run.
    doCheck = false;
    pythonImportsCheck = [ "pdftext" ];
  };
|
||||
|
||||
# surya-ocr, built from the PyPI sdist (published there as "surya_ocr").
surya-ocr = python3Packages.buildPythonPackage rec {
  pname = "surya-ocr";
  version = "0.17.1";
  pyproject = true;

  src = pkgs.fetchPypi {
    inherit version;
    pname = "surya_ocr";
    hash = "sha256-NJ142FTB7V+Bblg1Re1kUaoLxpkuKDqAUDR5mqzujCQ=";
  };

  build-system = [ python3Packages.poetry-core ];

  # Loosen all upstream version pins and drop the "pre-commit" requirement
  # from the package's declared dependencies.
  nativeBuildInputs = [ python3Packages.pythonRelaxDepsHook ];
  pythonRemoveDeps = [ "pre-commit" ];
  pythonRelaxDeps = true;

  # The list is split only so that the locally pinned pypdfium2 is named
  # outside the `with` scopes; element order is unchanged.
  dependencies =
    (with python3Packages; [
      transformers
      torch
      pydantic
      pydantic-settings
      python-dotenv
      pillow
    ])
    ++ [ pypdfium2 ]
    ++ (with python3Packages; [
      filetype
      click
      platformdirs
      opencv-python-headless
      einops
    ]);

  # Tests require model weights and GPU; fall back to an import check.
  doCheck = false;
  pythonImportsCheck = [ "surya" ];
};
|
||||
|
||||
# marker-pdf, built from the PyPI sdist (published there as "marker_pdf").
marker-pdf =
  let
    # Short alias for the package set, used throughout this derivation.
    py = python3Packages;
  in
  py.buildPythonPackage rec {
    pname = "marker-pdf";
    version = "1.10.2";
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit version;
      pname = "marker_pdf";
      hash = "sha256-zg/IOeEa11GaV20lTKnVGg+UVLnX2gIhH3IrFBMX+fE=";
    };

    build-system = [ py.poetry-core ];

    # Loosen all upstream version pins and drop the "pre-commit" requirement
    # from the package's declared dependencies.
    nativeBuildInputs = [ py.pythonRelaxDepsHook ];
    pythonRemoveDeps = [ "pre-commit" ];
    pythonRelaxDeps = true;

    # surya-ocr and pdftext are the derivations defined in this file; every
    # other entry is taken from the package set.
    dependencies = [
      py.pillow
      py.pydantic
      py.pydantic-settings
      py.transformers
      py.python-dotenv
      py.torch
      py.tqdm
      py.ftfy
      py.rapidfuzz
      surya-ocr
      py.regex
      pdftext
      py.markdownify
      py.click
      py.markdown2
      py.filetype
      py.google-genai
      py.anthropic
      py.scikit-learn
      py.openai
    ];

    # Tests require model weights; fall back to an import check.
    doCheck = false;
    pythonImportsCheck = [ "marker" ];
  };
|
||||
in
|
||||
{
  # Python environment containing marker-pdf, intended to put marker_single
  # on PATH.
  markerEnv = python3Packages.python.withPackages (_ps: [ marker-pdf ]);

  # The individual derivations, re-exported for direct use.
  inherit marker-pdf surya-ocr pdftext pypdfium2;
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue