Downloads PDFs from LibGen (primary) or Anna's Archive API (fallback), converts to markdown via marker_single, and prints to stdout. Includes XDG-compliant caching, nix flake with marker-pdf packaging, and a Claude Code skill for paper-reader integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
136 lines
3.8 KiB
Nix
136 lines
3.8 KiB
Nix
# Nix expressions for marker-pdf and the dependencies packaged alongside it
# here: pdftext, surya-ocr, and a version-pinned pypdfium2.
|
|
{ pkgs }:
|
|
|
|
let
|
|
# Python package set taken from the supplied nixpkgs; every derivation below
# is built against it.
inherit (pkgs) python3Packages;
|
|
|
|
# pypdfium2 4.30.0 — pinned because pdftext and surya-ocr require the v4.x API.
# Installed from the prebuilt manylinux wheel, which bundles libpdfium.
#
# The wheel fetched below is the x86_64 manylinux build, so the package is
# declared as x86_64-linux only. Without that restriction, evaluating it on
# another system would only fail later, while fetching/patching a binary that
# cannot run there.
pypdfium2 = python3Packages.buildPythonPackage {
  pname = "pypdfium2";
  version = "4.30.0";
  # Install the wheel as-is instead of building from source.
  format = "wheel";

  src = pkgs.fetchurl {
    url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
    hash = "sha256-8feNIYng3fmsK3qbm9Twxm9U0Tif9sF+n9ncA00G6z8=";
  };

  # The wheel ships a prebuilt shared library; autoPatchelfHook rewrites its
  # dynamic-linking metadata, resolving needed libraries from buildInputs
  # (here the compiler's runtime libraries).
  nativeBuildInputs = [ pkgs.autoPatchelfHook ];
  buildInputs = [ pkgs.stdenv.cc.cc.lib ];

  pythonImportsCheck = [ "pypdfium2" ];

  meta = {
    # Matches the manylinux_2_17_x86_64 wheel hard-coded in `src`.
    platforms = [ "x86_64-linux" ];
  };
};
|
|
|
|
# pdftext, built from the PyPI sdist with poetry-core as the build backend.
pdftext =
  let
    pname = "pdftext";
    version = "0.6.3";
  in
  python3Packages.buildPythonPackage {
    inherit pname version;
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit pname version;
      hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
    };

    build-system = with python3Packages; [ poetry-core ];

    # Relax every version bound declared by upstream (presumably so the
    # pinned local pypdfium2 and the nixpkgs versions are accepted).
    nativeBuildInputs = with python3Packages; [ pythonRelaxDepsHook ];
    pythonRelaxDeps = true;

    # pypdfium2 is the locally pinned derivation from this file, not the one
    # in python3Packages; it is kept outside the `with` list to make that
    # explicit.
    dependencies =
      [ pypdfium2 ]
      ++ (with python3Packages; [
        pydantic
        pydantic-settings
        click
      ]);

    # Tests require PDF fixtures not included in the sdist, so only an import
    # smoke test is run.
    doCheck = false;
    pythonImportsCheck = [ "pdftext" ];
  };
|
|
|
|
# surya-ocr, built from the PyPI sdist (published under the normalized name
# "surya_ocr") with poetry-core as the build backend.
surya-ocr =
  let
    py = python3Packages;
    version = "0.17.1";
  in
  py.buildPythonPackage {
    pname = "surya-ocr";
    inherit version;
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit version;
      pname = "surya_ocr";
      hash = "sha256-NJ142FTB7V+Bblg1Re1kUaoLxpkuKDqAUDR5mqzujCQ=";
    };

    build-system = [ py.poetry-core ];

    # Relax all upstream version bounds and drop the pre-commit requirement
    # from the declared runtime dependencies.
    nativeBuildInputs = [ py.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    pythonRemoveDeps = [ "pre-commit" ];

    # `pypdfium2` is the pinned derivation defined in this file; everything
    # else comes from the nixpkgs Python package set.
    dependencies = [
      py.transformers
      py.torch
      py.pydantic
      py.pydantic-settings
      py.python-dotenv
      py.pillow
      pypdfium2
      py.filetype
      py.click
      py.platformdirs
      py.opencv-python-headless
      py.einops
    ];

    # Tests require model weights and GPU, so only an import smoke test of
    # the `surya` module is run.
    doCheck = false;
    pythonImportsCheck = [ "surya" ];
  };
|
|
|
|
# marker-pdf, built from the PyPI sdist (published under the normalized name
# "marker_pdf") with poetry-core as the build backend.
marker-pdf =
  let
    py = python3Packages;
    version = "1.10.2";
  in
  py.buildPythonPackage {
    pname = "marker-pdf";
    inherit version;
    pyproject = true;

    src = pkgs.fetchPypi {
      inherit version;
      pname = "marker_pdf";
      hash = "sha256-zg/IOeEa11GaV20lTKnVGg+UVLnX2gIhH3IrFBMX+fE=";
    };

    build-system = [ py.poetry-core ];

    # Relax all upstream version bounds and drop the pre-commit requirement
    # from the declared runtime dependencies.
    nativeBuildInputs = [ py.pythonRelaxDepsHook ];
    pythonRelaxDeps = true;
    pythonRemoveDeps = [ "pre-commit" ];

    # `surya-ocr` and `pdftext` are the derivations defined in this file;
    # everything else comes from the nixpkgs Python package set.
    dependencies = [
      py.pillow
      py.pydantic
      py.pydantic-settings
      py.transformers
      py.python-dotenv
      py.torch
      py.tqdm
      py.ftfy
      py.rapidfuzz
      surya-ocr
      py.regex
      pdftext
      py.markdownify
      py.click
      py.markdown2
      py.filetype
      py.google-genai
      py.anthropic
      py.scikit-learn
      py.openai
    ];

    # Tests require model weights, so only an import smoke test of the
    # `marker` module is run.
    doCheck = false;
    pythonImportsCheck = [ "marker" ];
  };
|
|
in
|
|
{
  # Python environment containing marker-pdf, intended to put its
  # marker_single entry point on PATH.
  markerEnv = python3Packages.python.withPackages (_pythonPkgs: [ marker-pdf ]);

  # The individual package derivations, re-exported under their own names.
  inherit
    pypdfium2
    pdftext
    surya-ocr
    marker-pdf
    ;
}
|